diff --git a/include/gauxc/basisset.hpp b/include/gauxc/basisset.hpp
index 7ad1f105..0e0c659d 100644
--- a/include/gauxc/basisset.hpp
+++ b/include/gauxc/basisset.hpp
@@ -136,6 +136,11 @@ struct BasisSet : public std::vector> {
     return _nbf;
   }
+  inline int32_t max_l() const {  // largest l() reported by any element of this container
+    return std::max_element(this->cbegin(), this->cend(),  // NOTE(review): on an empty container this dereferences cend() - confirm callers never hit that case
+      [](const auto& a, const auto& b) { return a.l() < b.l(); })->l();
+  }
+
 }; // class BasisSet
 } // namespace GauXC
diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_cartesian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_cartesian.hpp
index fa5a545d..dba51eac 100644
--- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_cartesian.hpp
+++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_cartesian.hpp
@@ -227,6 +227,99 @@ GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_3_deriv1(
 }
+template <typename T>  // T: scalar type shared by bf, the coordinates and the output
+GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_4(  // writes bf times each of the 15 degree-4 monomials in x, y, z
+  int32_t npts,            // distance, in elements, between consecutive components in eval
+  const T bf,              // common factor multiplied into every monomial
+  const T x,
+  const T y,
+  const T z,
+  T* __restrict__ eval     // output; component k is stored at eval[npts * k]
+) {
+  // Component order: xxxx, xxxy, xxxz, xxyy, xxyz, xxzz, xyyy, xyyz, xyzz, xzzz, yyyy, yyyz, yyzz, yzzz, zzzz.
+  eval[npts * 0] = bf*x*x*x*x;
+  eval[npts * 1] = bf*x*x*x*y;
+  eval[npts * 2] = bf*x*x*x*z;
+  eval[npts * 3] = bf*x*x*y*y;
+  eval[npts * 4] = bf*x*x*y*z;
+  eval[npts * 5] = bf*x*x*z*z;
+  eval[npts * 6] = bf*x*y*y*y;
+  eval[npts * 7] = bf*x*y*y*z;
+  eval[npts * 8] = bf*x*y*z*z;
+  eval[npts * 9] = bf*x*z*z*z;
+  eval[npts * 10] = bf*y*y*y*y;
+  eval[npts * 11] = bf*y*y*y*z;
+  eval[npts * 12] = bf*y*y*z*z;
+  eval[npts * 13] = bf*y*z*z*z;
+  eval[npts * 14] = bf*z*z*z*z;
+
+}
+// x, y and z derivatives of the 15 products bf * (degree-4 monomial), in the same component order as collocation_cartesian_angular_4.
+template <typename T>  // T: scalar type shared by all value arguments and the outputs
+GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_4_deriv1(
+  const int32_t npts,      // distance, in elements, between consecutive components in each output
+  const T bf,              // common factor of every monomial
+  const T bf_x,            // the formulas below treat bf_x, bf_y, bf_z as the x, y, z derivatives of bf
+  const T bf_y,
+  const T bf_z,
+  const T x,
+  const T y,
+  const T z,
+  T* __restrict__ eval_x,  // outputs; component k is stored at eval_q[npts * k]
+  T* __restrict__ eval_y,
+  T* __restrict__ eval_z
+) {
+  // Each entry is the product rule, factored: e.g. d/dx( bf*x^4 ) = x^3*( 4*bf + bf_x*x ).
+  eval_x[npts * 0] = x*x*x*(4*bf + bf_x*x);
+  eval_x[npts * 1] = x*x*y*(3*bf + bf_x*x);
+  eval_x[npts * 2] = x*x*z*(3*bf + bf_x*x);
+  eval_x[npts * 3] = x*y*y*(2*bf + bf_x*x);
+  eval_x[npts * 4] = x*y*z*(2*bf + bf_x*x);
+  eval_x[npts * 5] = x*z*z*(2*bf + bf_x*x);
+  eval_x[npts * 6] = y*y*y*(bf + bf_x*x);
+  eval_x[npts * 7] = y*y*z*(bf + bf_x*x);
+  eval_x[npts * 8] = y*z*z*(bf + bf_x*x);
+  eval_x[npts * 9] = z*z*z*(bf + bf_x*x);
+  eval_x[npts * 10] = bf_x*y*y*y*y;
+  eval_x[npts * 11] = bf_x*y*y*y*z;
+  eval_x[npts * 12] = bf_x*y*y*z*z;
+  eval_x[npts * 13] = bf_x*y*z*z*z;
+  eval_x[npts * 14] = bf_x*z*z*z*z;
+
+  eval_y[npts * 0] = bf_y*x*x*x*x;
+  eval_y[npts * 1] = x*x*x*(bf + bf_y*y);
+  eval_y[npts * 2] = bf_y*x*x*x*z;
+  eval_y[npts * 3] = x*x*y*(2*bf + bf_y*y);
+  eval_y[npts * 4] = x*x*z*(bf + bf_y*y);
+  eval_y[npts * 5] = bf_y*x*x*z*z;
+  eval_y[npts * 6] = x*y*y*(3*bf + bf_y*y);
+  eval_y[npts * 7] = x*y*z*(2*bf + bf_y*y);
+  eval_y[npts * 8] = x*z*z*(bf + bf_y*y);
+  eval_y[npts * 9] = bf_y*x*z*z*z;
+  eval_y[npts * 10] = y*y*y*(4*bf + bf_y*y);
+  eval_y[npts * 11] = y*y*z*(3*bf + bf_y*y);
+  eval_y[npts * 12] = y*z*z*(2*bf + bf_y*y);
+  eval_y[npts * 13] = z*z*z*(bf + bf_y*y);
+  eval_y[npts * 14] = bf_y*z*z*z*z;
+
+  eval_z[npts * 0] = bf_z*x*x*x*x;
+  eval_z[npts * 1] = bf_z*x*x*x*y;
+  eval_z[npts * 2] = x*x*x*(bf + bf_z*z);
+  eval_z[npts * 3] = bf_z*x*x*y*y;
+  eval_z[npts * 4] = x*x*y*(bf + bf_z*z);
+  eval_z[npts * 5] = x*x*z*(2*bf + bf_z*z);
+  eval_z[npts * 6] = bf_z*x*y*y*y;
+  eval_z[npts * 7] = x*y*y*(bf + bf_z*z);
+  eval_z[npts * 8] = x*y*z*(2*bf + bf_z*z);
+  eval_z[npts * 9] = x*z*z*(3*bf + bf_z*z);
+  eval_z[npts * 10] = bf_z*y*y*y*y;
+  eval_z[npts * 11] = y*y*y*(bf + bf_z*z);
+  eval_z[npts * 12] = y*y*z*(2*bf + bf_z*z);
+  eval_z[npts * 13] = y*z*z*(3*bf + bf_z*z);
+  eval_z[npts * 14] = z*z*z*(4*bf + bf_z*z);
+
+}
+
 template
 GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular(
@@ -255,6 +348,10 @@ GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular(
       collocation_cartesian_angular_3( npts,
bf, x, y, z, eval ); + } else if( l == 4 ) { + + collocation_cartesian_angular_4( npts, bf, x, y, z, eval ); + } else { assert( false && "L < L_MAX" ); } @@ -300,6 +397,11 @@ GPGAUEVAL_INLINE __device__ void collocation_cartesian_angular_deriv1( collocation_cartesian_angular_3( npts, bf, x, y, z, eval ); collocation_cartesian_angular_3_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); + } else if( l == 4 ) { + + collocation_cartesian_angular_4( npts, bf, x, y, z, eval ); + collocation_cartesian_angular_4_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); + } else { assert( false && "L < L_MAX" ); } diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_spherical_unnorm.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_spherical_unnorm.hpp index e6f102d6..968cc3c8 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_spherical_unnorm.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_spherical_unnorm.hpp @@ -187,17 +187,17 @@ GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_3_deriv1( eval_x[npts * 0] = sqrt_10*y*(6*bf*x + bf_x*(3*x*x - y*y))/4; eval_x[npts * 1] = sqrt_15*y*z*(bf + bf_x*x); - eval_x[npts * 2] = -sqrt_6*y*(2*bf*x + bf_x*(x*x + y*y - 4*z*z))/4; - eval_x[npts * 3] = -z*(6*bf*x + bf_x*(3*x*x + 3*y*y - 2*z*z))/2; - eval_x[npts * 4] = -sqrt_6*(bf*(3*x*x + y*y - 4*z*z) + bf_x*x*(x*x + y*y - 4*z*z))/4; + eval_x[npts * 2] = sqrt_6*y*(-2*bf*x - bf_x*(x*x + y*y - 4*z*z))/4; + eval_x[npts * 3] = z*(-6*bf*x - bf_x*(3*x*x + 3*y*y - 2*z*z))/2; + eval_x[npts * 4] = sqrt_6*(-bf*(3*x*x + y*y - 4*z*z) - bf_x*x*(x*x + y*y - 4*z*z))/4; eval_x[npts * 5] = sqrt_15*z*(2*bf*x + bf_x*(x*x - y*y))/2; eval_x[npts * 6] = sqrt_10*(3*bf*(x*x - y*y) + bf_x*x*(x*x - 3*y*y))/4; eval_y[npts * 0] = sqrt_10*(-3*bf*(-x*x + y*y) + bf_y*y*(3*x*x - 
y*y))/4;
   eval_y[npts * 1] = sqrt_15*x*z*(bf + bf_y*y);
-  eval_y[npts * 2] = -sqrt_6*(bf*(x*x + 3*y*y - 4*z*z) + bf_y*y*(x*x + y*y - 4*z*z))/4;
-  eval_y[npts * 3] = -z*(6*bf*y + bf_y*(3*x*x + 3*y*y - 2*z*z))/2;
-  eval_y[npts * 4] = -sqrt_6*x*(2*bf*y + bf_y*(x*x + y*y - 4*z*z))/4;
+  eval_y[npts * 2] = sqrt_6*(-bf*(x*x + 3*y*y - 4*z*z) - bf_y*y*(x*x + y*y - 4*z*z))/4;
+  eval_y[npts * 3] = z*(-6*bf*y - bf_y*(3*x*x + 3*y*y - 2*z*z))/2;
+  eval_y[npts * 4] = sqrt_6*x*(-2*bf*y - bf_y*(x*x + y*y - 4*z*z))/4;
   eval_y[npts * 5] = sqrt_15*z*(-2*bf*y + bf_y*(x*x - y*y))/2;
   eval_y[npts * 6] = sqrt_10*x*(-6*bf*y + bf_y*(x*x - 3*y*y))/4;
@@ -211,6 +211,75 @@ GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_3_deriv1(
 }
+template <typename T>  // T: scalar type shared by bf, the coordinates and the output
+GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_4(  // writes bf times 9 degree-4 polynomials in x, y, z
+  int32_t npts,            // distance, in elements, between consecutive components in eval
+  const T bf,              // common factor multiplied into every polynomial
+  const T x,
+  const T y,
+  const T z,
+  T* __restrict__ eval     // output; component k is stored at eval[npts * k]
+) {
+  // sqrt_5, sqrt_10, sqrt_35 and sqrt_70 are not defined here; they must already be in scope. Presumably components 0..8 are m = -4..+4 - TODO confirm.
+  eval[npts * 0] = sqrt_35*bf*x*y*(x*x - y*y)/2;
+  eval[npts * 1] = sqrt_70*bf*y*z*(3*x*x - y*y)/4;
+  eval[npts * 2] = sqrt_5*bf*x*y*(-x*x - y*y + 6*z*z)/2;
+  eval[npts * 3] = sqrt_10*bf*y*z*(-3*x*x - 3*y*y + 4*z*z)/4;
+  eval[npts * 4] = bf*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8;
+  eval[npts * 5] = sqrt_10*bf*x*z*(-3*x*x - 3*y*y + 4*z*z)/4;
+  eval[npts * 6] = sqrt_5*bf*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4;
+  eval[npts * 7] = sqrt_70*bf*x*z*(x*x - 3*y*y)/4;
+  eval[npts * 8] = sqrt_35*bf*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8;
+
+}
+// x, y and z derivatives of the 9 products written by collocation_spherical_unnorm_angular_4, in the same component order.
+template <typename T>  // T: scalar type shared by all value arguments and the outputs
+GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_4_deriv1(
+  const int32_t npts,      // distance, in elements, between consecutive components in each output
+  const T bf,              // common factor of every polynomial
+  const T bf_x,            // the formulas below treat bf_x, bf_y, bf_z as the x, y, z derivatives of bf
+  const T bf_y,
+  const T bf_z,
+  const T x,
+  const T y,
+  const T z,
+  T* __restrict__ eval_x,  // outputs; component k is stored at eval_q[npts * k]
+  T* __restrict__ eval_y,
+  T* __restrict__ eval_z
+) {
+  // Each entry is the product rule d/dq( bf*P ) = bf_q*P + bf*dP/dq, with common factors pulled out.
+  eval_x[npts * 0] = sqrt_35*y*(bf*(3*x*x - y*y) + bf_x*x*(x*x - y*y))/2;
+  eval_x[npts * 1] = sqrt_70*y*z*(6*bf*x + bf_x*(3*x*x - y*y))/4;
+  eval_x[npts * 2] = sqrt_5*y*(-bf*(3*x*x + y*y - 6*z*z) - bf_x*x*(x*x + y*y - 6*z*z))/2;
+  eval_x[npts * 3] = sqrt_10*y*z*(-6*bf*x - bf_x*(3*x*x + 3*y*y - 4*z*z))/4;
+  eval_x[npts * 4] = 3*bf*x*(x*x + y*y - 4*z*z)/2 + bf_x*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8;
+  eval_x[npts * 5] = sqrt_10*z*(-bf*(9*x*x + 3*y*y - 4*z*z) - bf_x*x*(3*x*x + 3*y*y - 4*z*z))/4;
+  eval_x[npts * 6] = sqrt_5*(-bf*x*(x*x - 3*z*z) - bf_x*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z)/4);
+  eval_x[npts * 7] = sqrt_70*z*(3*bf*(x*x - y*y) + bf_x*x*(x*x - 3*y*y))/4;
+  eval_x[npts * 8] = sqrt_35*(4*bf*x*(x*x - 3*y*y) + bf_x*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8;
+
+  eval_y[npts * 0] = sqrt_35*x*(-bf*(-x*x + 3*y*y) + bf_y*y*(x*x - y*y))/2;
+  eval_y[npts * 1] = sqrt_70*z*(-3*bf*(-x*x + y*y) + bf_y*y*(3*x*x - y*y))/4;
+  eval_y[npts * 2] = sqrt_5*x*(-bf*(x*x + 3*y*y - 6*z*z) - bf_y*y*(x*x + y*y - 6*z*z))/2;
+  eval_y[npts * 3] = sqrt_10*z*(-bf*(3*x*x + 9*y*y - 4*z*z) - bf_y*y*(3*x*x + 3*y*y - 4*z*z))/4;
+  eval_y[npts * 4] = 3*bf*y*(x*x + y*y - 4*z*z)/2 + bf_y*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8;
+  eval_y[npts * 5] = sqrt_10*x*z*(-6*bf*y - bf_y*(3*x*x + 3*y*y - 4*z*z))/4;
+  eval_y[npts * 6] = sqrt_5*(bf*y*(y*y - 3*z*z) - bf_y*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z)/4);
+  eval_y[npts * 7] = sqrt_70*x*z*(-6*bf*y + bf_y*(x*x - 3*y*y))/4;
+  eval_y[npts * 8] = sqrt_35*(-4*bf*y*(3*x*x - y*y) + bf_y*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8;
+
+  eval_z[npts * 0] = sqrt_35*bf_z*x*y*(x*x - y*y)/2;
+  eval_z[npts * 1] = sqrt_70*y*(bf + bf_z*z)*(3*x*x - y*y)/4;
+  eval_z[npts * 2] = sqrt_5*x*y*(12*bf*z - bf_z*(x*x + y*y - 6*z*z))/2;
+  eval_z[npts * 3] = sqrt_10*y*(3*bf*(-x*x - y*y + 4*z*z) - bf_z*z*(3*x*x + 3*y*y - 4*z*z))/4;
+  eval_z[npts * 4] = -2*bf*z*(3*x*x + 3*y*y - 2*z*z) + bf_z*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8;
+  eval_z[npts * 5] = sqrt_10*x*(3*bf*(-x*x - y*y + 4*z*z) - bf_z*z*(3*x*x + 3*y*y - 4*z*z))/4;
+  eval_z[npts * 6] = sqrt_5*(12*bf*z*(x*x - y*y) - bf_z*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4;
+  eval_z[npts * 7] = sqrt_70*x*(bf + bf_z*z)*(x*x - 3*y*y)/4;
+  eval_z[npts * 8] = sqrt_35*bf_z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8;
+
+}
+
 template
 GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular(
@@ -239,6 +308,10 @@ GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular(
       collocation_spherical_unnorm_angular_3( npts, bf, x, y, z, eval );
+  } else if( l == 4 ) {
+
+      collocation_spherical_unnorm_angular_4( npts, bf, x, y, z, eval );
+
   } else {
     assert( false && "L < L_MAX" );
   }
@@ -284,6 +357,11 @@ GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_deriv1(
     collocation_spherical_unnorm_angular_3( npts, bf, x, y, z, eval );
     collocation_spherical_unnorm_angular_3_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z );
+  } else if( l == 4 ) {
+
+    collocation_spherical_unnorm_angular_4( npts, bf, x, y, z, eval );
+    collocation_spherical_unnorm_angular_4_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z );
+
   } else {
     assert( false && "L < L_MAX" );
   }
diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_device_constants.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_device_constants.hpp
index 98d180dc..d265a8d4 100644
--- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_device_constants.hpp
+++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_device_constants.hpp
@@ -9,9 +9,12 @@
 namespace GauXC {
-  constexpr double sqrt_15 = 3.872983346207417;
+  constexpr double sqrt_10 = 3.1622776601683795;
   constexpr double sqrt_3 = 1.7320508075688772;
+  constexpr double sqrt_15 = 3.872983346207417;
+  constexpr double sqrt_35 = 5.916079783099616;
   constexpr double sqrt_6 = 2.449489742783178;
-  constexpr double sqrt_10 = 3.1622776601683795;
+  constexpr double sqrt_70 = 8.366600265340756;
+  constexpr double
sqrt_5 = 2.23606797749979;
 } // namespace GauXC
diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4.hpp
new file mode 100644
index 00000000..2a0f65d9
--- /dev/null
+++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4.hpp
@@ -0,0 +1,175 @@
+/**
+ * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * through Lawrence Berkeley National Laboratory (subject to receipt of
+ * any required approvals from the U.S. Dept. of Energy). All rights reserved.
+ *
+ * See LICENSE.txt for details
+ */
+#pragma once
+#include "collocation_device_constants.hpp"
+#include "device/xc_device_task.hpp"
+#include "device_specific/cuda_device_constants.hpp"
+#include "device/common/shell_to_task.hpp"
+#include <cmath>  // std::exp
+
+namespace GauXC {
+// For every shell in shell_to_task, evaluates its 15 Cartesian l=4 functions on the points of each task listed for that shell.
+// Shells are strided over blockIdx.z, tasks over warps, points over the lanes of a warp; results are written to task->bf.
+__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_4(
+  uint32_t                        nshell,         // number of entries of shell_to_task to process
+  ShellToTaskDevice* __restrict__ shell_to_task,  // per shell: ntask, shell_device, task_idx_device, task_shell_offs_device
+  XCDeviceTask*      __restrict__ device_tasks    // task records, indexed through task_idx_device
+) {
+
+  // One row per warp: 16 rows = 512 threads (the launch bound) / 32 lanes. The extra column is presumably padding - TODO confirm.
+  __shared__ double alpha[16][detail::shell_nprim_max + 1];
+  __shared__ double coeff[16][detail::shell_nprim_max + 1];
+  double* my_alpha = alpha[threadIdx.x/32];
+  double* my_coeff = coeff[threadIdx.x/32];
+
+  for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) {
+    const uint32_t ntasks      = shell_to_task[ish].ntask;
+    const auto shell           = shell_to_task[ish].shell_device;
+    const auto task_idx        = shell_to_task[ish].task_idx_device;
+    const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device;
+
+
+    // Load Shell Data into registers / SM
+    const uint32_t nprim = shell->nprim();
+    const double3 O = *reinterpret_cast<const double3*>(shell->O_data());  // shell origin, read as three consecutive doubles
+
+    const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size;
+    const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1);
+    __syncwarp(); // lanes still reading the previous shell's row must finish before it is overwritten
+    // Stage exponents/coefficients in shared memory: every warp fills its own row, one entry per lane
+    {
+      auto* coeff_gm = shell->coeff_data();
+      auto* alpha_gm = shell->alpha_data();
+      static_assert( detail::shell_nprim_max == cuda::warp_size );
+      const int warp_rank = threadIdx.x % cuda::warp_size;
+      my_alpha[warp_rank] = alpha_gm[warp_rank];  // copies all warp_size entries, whatever nprim is
+      my_coeff[warp_rank] = coeff_gm[warp_rank];
+    }
+    __syncwarp(); // make every lane's store visible before any lane reads entries written by other lanes
+    // Loop over tasks assigned to shells
+    // Place each task on a different warp + schedule across blocks
+    for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) {
+
+      const auto*              task   = device_tasks + task_idx[itask];
+      const auto* __restrict__ points_x = task->points_x;
+      const auto* __restrict__ points_y = task->points_y;
+      const auto* __restrict__ points_z = task->points_z;
+      const uint32_t           npts   = task->npts;
+      const size_t             shoff  = task_shell_offs[itask] * npts;  // offset of this shell's first component in the task's buffer
+
+      auto* __restrict__ basis_eval = task->bf + shoff;
+
+
+      // Loop over points in task
+      // Assign each point to separate thread within the warp
+      #pragma unroll 1
+      for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) {
+        //const double3 point = points[ipt];
+        double3 point;
+        point.x = points_x[ipt];
+        point.y = points_y[ipt];
+        point.z = points_z[ipt];
+
+
+        const auto x = point.x - O.x;  // coordinates relative to the shell origin
+        const auto y = point.y - O.y;
+        const auto z = point.z - O.z;
+        const auto rsq = x*x + y*y + z*z;
+
+        // Evaluate radial part of bfn
+        double radial_eval = 0.;
+
+        #pragma unroll 1
+        for( uint32_t i = 0; i < nprim; ++i ) {
+          const auto a = my_alpha[i];
+          const auto e = my_coeff[i] * std::exp( - a * rsq );
+
+          radial_eval += e;
+        }
+
+
+
+
+        // Evaluate basis function
+        basis_eval[ipt + 0*npts] = radial_eval*x*x*x*x;
+        basis_eval[ipt + 1*npts] = radial_eval*x*x*x*y;
+        basis_eval[ipt + 2*npts] = radial_eval*x*x*x*z;
+        basis_eval[ipt + 3*npts] = radial_eval*x*x*y*y;
+        basis_eval[ipt + 4*npts] = radial_eval*x*x*y*z;
+        basis_eval[ipt + 5*npts] = radial_eval*x*x*z*z;
+        basis_eval[ipt + 6*npts] = radial_eval*x*y*y*y;
+        basis_eval[ipt + 7*npts] = radial_eval*x*y*y*z;
+        basis_eval[ipt + 8*npts] = radial_eval*x*y*z*z;
+        basis_eval[ipt + 9*npts] = radial_eval*x*z*z*z;
+        basis_eval[ipt + 10*npts] = radial_eval*y*y*y*y;
+        basis_eval[ipt + 11*npts] = radial_eval*y*y*y*z;
+        basis_eval[ipt + 12*npts] = radial_eval*y*y*z*z;
+        basis_eval[ipt + 13*npts] = radial_eval*y*z*z*z;
+        basis_eval[ipt + 14*npts] = radial_eval*z*z*z*z;
+
+
+
+
+
+
+
+
+#if 0
+        // Evaluate the angular part of bfn
+
+
+
+        double ang_eval_0;
+        double ang_eval_1;
+        double ang_eval_2;
+        double ang_eval_3;
+
+
+        ang_eval_0 = radial_eval*x*x*x*x;
+        ang_eval_1 = radial_eval*x*x*x*y;
+        ang_eval_2 = radial_eval*x*x*x*z;
+        ang_eval_3 = radial_eval*x*x*y*y;
+        basis_eval[ipt + 0*npts] = ang_eval_0;
+        basis_eval[ipt + 1*npts] = ang_eval_1;
+        basis_eval[ipt + 2*npts] = ang_eval_2;
+        basis_eval[ipt + 3*npts] = ang_eval_3;
+
+        ang_eval_0 = radial_eval*x*x*y*z;
+        ang_eval_1 = radial_eval*x*x*z*z;
+        ang_eval_2 = radial_eval*x*y*y*y;
+        ang_eval_3 = radial_eval*x*y*y*z;
+        basis_eval[ipt + 4*npts] = ang_eval_0;
+        basis_eval[ipt + 5*npts] = ang_eval_1;
+        basis_eval[ipt + 6*npts] = ang_eval_2;
+        basis_eval[ipt + 7*npts] = ang_eval_3;
+
+        ang_eval_0 = radial_eval*x*y*z*z;
+        ang_eval_1 = radial_eval*x*z*z*z;
+        ang_eval_2 = radial_eval*y*y*y*y;
+        ang_eval_3 = radial_eval*y*y*y*z;
+        basis_eval[ipt + 8*npts] = ang_eval_0;
+        basis_eval[ipt + 9*npts] = ang_eval_1;
+        basis_eval[ipt + 10*npts] = ang_eval_2;
+        basis_eval[ipt + 11*npts] = ang_eval_3;
+
+        ang_eval_0 = radial_eval*y*y*z*z;
+        ang_eval_1 = radial_eval*y*z*z*z;
+        ang_eval_2 = radial_eval*z*z*z*z;
+        basis_eval[ipt + 12*npts] = ang_eval_0;
+        basis_eval[ipt + 13*npts] = ang_eval_1;
+        basis_eval[ipt + 14*npts] = ang_eval_2;
+
+
+#endif
+      } // Loop over points within task
+    } // Loop over tasks
+
+  } // Loop over shells
+} // end kernel
+
+} // namespace GauXC
diff --git
a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_gradient.hpp
new file mode 100644
index 00000000..878c3f4f
--- /dev/null
+++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_gradient.hpp
@@ -0,0 +1,330 @@
+/**
+ * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * through Lawrence Berkeley National Laboratory (subject to receipt of
+ * any required approvals from the U.S. Dept. of Energy). All rights reserved.
+ *
+ * See LICENSE.txt for details
+ */
+#pragma once
+#include "collocation_device_constants.hpp"
+#include "device/xc_device_task.hpp"
+#include "device_specific/cuda_device_constants.hpp"
+#include "device/common/shell_to_task.hpp"
+#include <cmath>  // std::exp
+
+namespace GauXC {
+// For every shell in shell_to_task, evaluates its 15 Cartesian l=4 functions and their x/y/z derivatives on the points of each task listed for that shell.
+// Shells are strided over blockIdx.z, tasks over warps, points over the lanes of a warp; results go to task->bf, dbfx, dbfy, dbfz.
+__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_gradient_4(
+  uint32_t                        nshell,         // number of entries of shell_to_task to process
+  ShellToTaskDevice* __restrict__ shell_to_task,  // per shell: ntask, shell_device, task_idx_device, task_shell_offs_device
+  XCDeviceTask*      __restrict__ device_tasks    // task records, indexed through task_idx_device
+) {
+
+  // One row per warp: 16 rows = 512 threads (the launch bound) / 32 lanes. The extra column is presumably padding - TODO confirm.
+  __shared__ double alpha[16][detail::shell_nprim_max + 1];
+  __shared__ double coeff[16][detail::shell_nprim_max + 1];
+  double* my_alpha = alpha[threadIdx.x/32];
+  double* my_coeff = coeff[threadIdx.x/32];
+
+  for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) {
+    const uint32_t ntasks      = shell_to_task[ish].ntask;
+    const auto shell           = shell_to_task[ish].shell_device;
+    const auto task_idx        = shell_to_task[ish].task_idx_device;
+    const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device;
+
+
+    // Load Shell Data into registers / SM
+    const uint32_t nprim = shell->nprim();
+    const double3 O = *reinterpret_cast<const double3*>(shell->O_data());  // shell origin, read as three consecutive doubles
+
+    const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size;
+    const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1);
+    __syncwarp(); // lanes still reading the previous shell's row must finish before it is overwritten
+    // Stage exponents/coefficients in shared memory: every warp fills its own row, one entry per lane
+    {
+      auto* coeff_gm = shell->coeff_data();
+      auto* alpha_gm = shell->alpha_data();
+      static_assert( detail::shell_nprim_max == cuda::warp_size );
+      const int warp_rank = threadIdx.x % cuda::warp_size;
+      my_alpha[warp_rank] = alpha_gm[warp_rank];  // copies all warp_size entries, whatever nprim is
+      my_coeff[warp_rank] = coeff_gm[warp_rank];
+    }
+    __syncwarp(); // make every lane's store visible before any lane reads entries written by other lanes
+    // Loop over tasks assigned to shells
+    // Place each task on a different warp + schedule across blocks
+    for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) {
+
+      const auto*              task   = device_tasks + task_idx[itask];
+      const auto* __restrict__ points_x = task->points_x;
+      const auto* __restrict__ points_y = task->points_y;
+      const auto* __restrict__ points_z = task->points_z;
+      const uint32_t           npts   = task->npts;
+      const size_t             shoff  = task_shell_offs[itask] * npts;  // offset of this shell's first component in the task's buffers
+
+      auto* __restrict__ basis_eval = task->bf + shoff;
+      auto* __restrict__ basis_x_eval = task->dbfx + shoff;
+      auto* __restrict__ basis_y_eval = task->dbfy + shoff;
+      auto* __restrict__ basis_z_eval = task->dbfz + shoff;
+
+
+      // Loop over points in task
+      // Assign each point to separate thread within the warp
+      #pragma unroll 1
+      for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) {
+        //const double3 point = points[ipt];
+        double3 point;
+        point.x = points_x[ipt];
+        point.y = points_y[ipt];
+        point.z = points_z[ipt];
+
+
+        const auto x = point.x - O.x;  // coordinates relative to the shell origin
+        const auto y = point.y - O.y;
+        const auto z = point.z - O.z;
+        const auto rsq = x*x + y*y + z*z;
+
+        // Evaluate radial part of bfn
+        double radial_eval = 0.;
+        double radial_eval_alpha = 0.;
+
+        #pragma unroll 1
+        for( uint32_t i = 0; i < nprim; ++i ) {
+          const auto a = my_alpha[i];
+          const auto e = my_coeff[i] * std::exp( - a * rsq );
+
+          radial_eval += e;
+          radial_eval_alpha += a * e;
+        }
+
+        radial_eval_alpha *= -2;  // so that d(radial_eval)/dq = radial_eval_alpha * q for q in {x, y, z}
+
+
+
+        // Evaluate basis function
+        basis_eval[ipt + 0*npts] = radial_eval*x*x*x*x;
+        basis_eval[ipt + 1*npts] = radial_eval*x*x*x*y;
+        basis_eval[ipt + 2*npts] = radial_eval*x*x*x*z;
+        basis_eval[ipt + 3*npts] = radial_eval*x*x*y*y;
+        basis_eval[ipt + 4*npts] = radial_eval*x*x*y*z;
+        basis_eval[ipt + 5*npts] = radial_eval*x*x*z*z;
+        basis_eval[ipt + 6*npts] = radial_eval*x*y*y*y;
+        basis_eval[ipt + 7*npts] = radial_eval*x*y*y*z;
+        basis_eval[ipt + 8*npts] = radial_eval*x*y*z*z;
+        basis_eval[ipt + 9*npts] = radial_eval*x*z*z*z;
+        basis_eval[ipt + 10*npts] = radial_eval*y*y*y*y;
+        basis_eval[ipt + 11*npts] = radial_eval*y*y*y*z;
+        basis_eval[ipt + 12*npts] = radial_eval*y*y*z*z;
+        basis_eval[ipt + 13*npts] = radial_eval*y*z*z*z;
+        basis_eval[ipt + 14*npts] = radial_eval*z*z*z*z;
+
+
+
+        // Evaluate first derivative of bfn wrt x
+        basis_x_eval[ipt + 0*npts] = x*x*x*(4*radial_eval + radial_eval_alpha*x*x);
+        basis_x_eval[ipt + 1*npts] = x*x*y*(3*radial_eval + radial_eval_alpha*x*x);
+        basis_x_eval[ipt + 2*npts] = x*x*z*(3*radial_eval + radial_eval_alpha*x*x);
+        basis_x_eval[ipt + 3*npts] = x*y*y*(2*radial_eval + radial_eval_alpha*x*x);
+        basis_x_eval[ipt + 4*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*x*x);
+        basis_x_eval[ipt + 5*npts] = x*z*z*(2*radial_eval + radial_eval_alpha*x*x);
+        basis_x_eval[ipt + 6*npts] = y*y*y*(radial_eval + radial_eval_alpha*x*x);
+        basis_x_eval[ipt + 7*npts] = y*y*z*(radial_eval + radial_eval_alpha*x*x);
+        basis_x_eval[ipt + 8*npts] = y*z*z*(radial_eval + radial_eval_alpha*x*x);
+        basis_x_eval[ipt + 9*npts] = z*z*z*(radial_eval + radial_eval_alpha*x*x);
+        basis_x_eval[ipt + 10*npts] = radial_eval_alpha*x*y*y*y*y;
+        basis_x_eval[ipt + 11*npts] = radial_eval_alpha*x*y*y*y*z;
+        basis_x_eval[ipt + 12*npts] = radial_eval_alpha*x*y*y*z*z;
+        basis_x_eval[ipt + 13*npts] = radial_eval_alpha*x*y*z*z*z;
+        basis_x_eval[ipt + 14*npts] = radial_eval_alpha*x*z*z*z*z;
+
+        // Evaluate first derivative of bfn wrt y
+        basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*x*y;
+        basis_y_eval[ipt + 1*npts] = x*x*x*(radial_eval + radial_eval_alpha*y*y);
+        basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*x*x*y*z;
+        basis_y_eval[ipt + 3*npts] = x*x*y*(2*radial_eval + radial_eval_alpha*y*y);
+        basis_y_eval[ipt + 4*npts] = x*x*z*(radial_eval + radial_eval_alpha*y*y);
+        basis_y_eval[ipt + 5*npts] = radial_eval_alpha*x*x*y*z*z;
+        basis_y_eval[ipt + 6*npts] = x*y*y*(3*radial_eval + radial_eval_alpha*y*y);
+        basis_y_eval[ipt + 7*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*y*y);
+        basis_y_eval[ipt + 8*npts] = x*z*z*(radial_eval + radial_eval_alpha*y*y);
+        basis_y_eval[ipt + 9*npts] = radial_eval_alpha*x*y*z*z*z;
+        basis_y_eval[ipt + 10*npts] = y*y*y*(4*radial_eval + radial_eval_alpha*y*y);
+        basis_y_eval[ipt + 11*npts] = y*y*z*(3*radial_eval + radial_eval_alpha*y*y);
+        basis_y_eval[ipt + 12*npts] = y*z*z*(2*radial_eval + radial_eval_alpha*y*y);
+        basis_y_eval[ipt + 13*npts] = z*z*z*(radial_eval + radial_eval_alpha*y*y);
+        basis_y_eval[ipt + 14*npts] = radial_eval_alpha*y*z*z*z*z;
+
+        // Evaluate first derivative of bfn wrt z
+        basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*x*z;
+        basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*x*x*y*z;
+        basis_z_eval[ipt + 2*npts] = x*x*x*(radial_eval + radial_eval_alpha*z*z);
+        basis_z_eval[ipt + 3*npts] = radial_eval_alpha*x*x*y*y*z;
+        basis_z_eval[ipt + 4*npts] = x*x*y*(radial_eval + radial_eval_alpha*z*z);
+        basis_z_eval[ipt + 5*npts] = x*x*z*(2*radial_eval + radial_eval_alpha*z*z);
+        basis_z_eval[ipt + 6*npts] = radial_eval_alpha*x*y*y*y*z;
+        basis_z_eval[ipt + 7*npts] = x*y*y*(radial_eval + radial_eval_alpha*z*z);
+        basis_z_eval[ipt + 8*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*z*z);
+        basis_z_eval[ipt + 9*npts] = x*z*z*(3*radial_eval + radial_eval_alpha*z*z);
+        basis_z_eval[ipt + 10*npts] = radial_eval_alpha*y*y*y*y*z;
+        basis_z_eval[ipt + 11*npts] = y*y*y*(radial_eval + radial_eval_alpha*z*z);
+        basis_z_eval[ipt + 12*npts] = y*y*z*(2*radial_eval + radial_eval_alpha*z*z);
+        basis_z_eval[ipt + 13*npts] = y*z*z*(3*radial_eval + radial_eval_alpha*z*z);
+        basis_z_eval[ipt + 14*npts] = z*z*z*(4*radial_eval + radial_eval_alpha*z*z);
+
+
+
+
+
+#if 0
+        // Evaluate the angular part of bfn
+
+
+
+        double ang_eval_0;
+        double ang_eval_1;
+        double ang_eval_2;
+        double ang_eval_3;
+
+
+        ang_eval_0 = radial_eval*x*x*x*x;
+        ang_eval_1 = radial_eval*x*x*x*y;
+        ang_eval_2 = radial_eval*x*x*x*z;
+        ang_eval_3 = radial_eval*x*x*y*y;
+        basis_eval[ipt + 0*npts] = ang_eval_0;
+        basis_eval[ipt + 1*npts] = ang_eval_1;
+        basis_eval[ipt + 2*npts] = ang_eval_2;
+        basis_eval[ipt + 3*npts] = ang_eval_3;
+
+        ang_eval_0 = radial_eval*x*x*y*z;
+        ang_eval_1 = radial_eval*x*x*z*z;
+        ang_eval_2 = radial_eval*x*y*y*y;
+        ang_eval_3 = radial_eval*x*y*y*z;
+        basis_eval[ipt + 4*npts] = ang_eval_0;
+        basis_eval[ipt + 5*npts] = ang_eval_1;
+        basis_eval[ipt + 6*npts] = ang_eval_2;
+        basis_eval[ipt + 7*npts] = ang_eval_3;
+
+        ang_eval_0 = radial_eval*x*y*z*z;
+        ang_eval_1 = radial_eval*x*z*z*z;
+        ang_eval_2 = radial_eval*y*y*y*y;
+        ang_eval_3 = radial_eval*y*y*y*z;
+        basis_eval[ipt + 8*npts] = ang_eval_0;
+        basis_eval[ipt + 9*npts] = ang_eval_1;
+        basis_eval[ipt + 10*npts] = ang_eval_2;
+        basis_eval[ipt + 11*npts] = ang_eval_3;
+
+        ang_eval_0 = radial_eval*y*y*z*z;
+        ang_eval_1 = radial_eval*y*z*z*z;
+        ang_eval_2 = radial_eval*z*z*z*z;
+        basis_eval[ipt + 12*npts] = ang_eval_0;
+        basis_eval[ipt + 13*npts] = ang_eval_1;
+        basis_eval[ipt + 14*npts] = ang_eval_2;
+
+
+        double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0;
+        double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1;
+        double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2;
+        double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3;
+
+        dang_eval_x_0 = x*x*x*(4*radial_eval + radial_eval_alpha*x*x);
+        dang_eval_y_0 = radial_eval_alpha*x*x*x*x*y;
+        dang_eval_z_0 = radial_eval_alpha*x*x*x*x*z;
+        dang_eval_x_1 = x*x*y*(3*radial_eval + radial_eval_alpha*x*x);
+        dang_eval_y_1 = x*x*x*(radial_eval + radial_eval_alpha*y*y);
+        dang_eval_z_1 = radial_eval_alpha*x*x*x*y*z;
+        dang_eval_x_2 = x*x*z*(3*radial_eval + radial_eval_alpha*x*x);
+        dang_eval_y_2 = radial_eval_alpha*x*x*x*y*z;
+        dang_eval_z_2 = x*x*x*(radial_eval + radial_eval_alpha*z*z);
+        dang_eval_x_3 = x*y*y*(2*radial_eval + radial_eval_alpha*x*x);
+        dang_eval_y_3 = x*x*y*(2*radial_eval + radial_eval_alpha*y*y);
+        dang_eval_z_3 = radial_eval_alpha*x*x*y*y*z;
+        basis_x_eval[ipt + 0*npts] = dang_eval_x_0;
+        basis_y_eval[ipt + 0*npts] = dang_eval_y_0;
+        basis_z_eval[ipt + 0*npts] = dang_eval_z_0;
+        basis_x_eval[ipt + 1*npts] = dang_eval_x_1;
+        basis_y_eval[ipt + 1*npts] = dang_eval_y_1;
+        basis_z_eval[ipt + 1*npts] = dang_eval_z_1;
+        basis_x_eval[ipt + 2*npts] = dang_eval_x_2;
+        basis_y_eval[ipt + 2*npts] = dang_eval_y_2;
+        basis_z_eval[ipt + 2*npts] = dang_eval_z_2;
+        basis_x_eval[ipt + 3*npts] = dang_eval_x_3;
+        basis_y_eval[ipt + 3*npts] = dang_eval_y_3;
+        basis_z_eval[ipt + 3*npts] = dang_eval_z_3;
+
+        dang_eval_x_0 = x*y*z*(2*radial_eval + radial_eval_alpha*x*x);
+        dang_eval_y_0 = x*x*z*(radial_eval + radial_eval_alpha*y*y);
+        dang_eval_z_0 = x*x*y*(radial_eval + radial_eval_alpha*z*z);
+        dang_eval_x_1 = x*z*z*(2*radial_eval + radial_eval_alpha*x*x);
+        dang_eval_y_1 = radial_eval_alpha*x*x*y*z*z;
+        dang_eval_z_1 = x*x*z*(2*radial_eval + radial_eval_alpha*z*z);
+        dang_eval_x_2 = y*y*y*(radial_eval + radial_eval_alpha*x*x);
+        dang_eval_y_2 = x*y*y*(3*radial_eval + radial_eval_alpha*y*y);
+        dang_eval_z_2 = radial_eval_alpha*x*y*y*y*z;
+        dang_eval_x_3 = y*y*z*(radial_eval + radial_eval_alpha*x*x);
+        dang_eval_y_3 = x*y*z*(2*radial_eval + radial_eval_alpha*y*y);
+        dang_eval_z_3 = x*y*y*(radial_eval + radial_eval_alpha*z*z);
+        basis_x_eval[ipt + 4*npts] = dang_eval_x_0;
+        basis_y_eval[ipt + 4*npts] = dang_eval_y_0;
+        basis_z_eval[ipt + 4*npts] = dang_eval_z_0;
+        basis_x_eval[ipt + 5*npts] = dang_eval_x_1;
+        basis_y_eval[ipt + 5*npts] = dang_eval_y_1;
+        basis_z_eval[ipt + 5*npts] = dang_eval_z_1;
+        basis_x_eval[ipt + 6*npts] = dang_eval_x_2;
+        basis_y_eval[ipt + 6*npts] = dang_eval_y_2;
+        basis_z_eval[ipt + 6*npts] = dang_eval_z_2;
+        basis_x_eval[ipt + 7*npts] = dang_eval_x_3;
+        basis_y_eval[ipt + 7*npts] = dang_eval_y_3;
+        basis_z_eval[ipt + 7*npts] = dang_eval_z_3;
+
+        dang_eval_x_0 = y*z*z*(radial_eval + radial_eval_alpha*x*x);
+        dang_eval_y_0 = x*z*z*(radial_eval + radial_eval_alpha*y*y);
+        dang_eval_z_0 = x*y*z*(2*radial_eval + radial_eval_alpha*z*z);
+        dang_eval_x_1 = z*z*z*(radial_eval + radial_eval_alpha*x*x);
+        dang_eval_y_1 = radial_eval_alpha*x*y*z*z*z;
+        dang_eval_z_1 = x*z*z*(3*radial_eval + radial_eval_alpha*z*z);
+        dang_eval_x_2 = radial_eval_alpha*x*y*y*y*y;
+        dang_eval_y_2 = y*y*y*(4*radial_eval + radial_eval_alpha*y*y);
+        dang_eval_z_2 = radial_eval_alpha*y*y*y*y*z;
+        dang_eval_x_3 = radial_eval_alpha*x*y*y*y*z;
+        dang_eval_y_3 = y*y*z*(3*radial_eval + radial_eval_alpha*y*y);
+        dang_eval_z_3 = y*y*y*(radial_eval + radial_eval_alpha*z*z);
+        basis_x_eval[ipt + 8*npts] = dang_eval_x_0;
+        basis_y_eval[ipt + 8*npts] = dang_eval_y_0;
+        basis_z_eval[ipt + 8*npts] = dang_eval_z_0;
+        basis_x_eval[ipt + 9*npts] = dang_eval_x_1;
+        basis_y_eval[ipt + 9*npts] = dang_eval_y_1;
+        basis_z_eval[ipt + 9*npts] = dang_eval_z_1;
+        basis_x_eval[ipt + 10*npts] = dang_eval_x_2;
+        basis_y_eval[ipt + 10*npts] = dang_eval_y_2;
+        basis_z_eval[ipt + 10*npts] = dang_eval_z_2;
+        basis_x_eval[ipt + 11*npts] = dang_eval_x_3;
+        basis_y_eval[ipt + 11*npts] = dang_eval_y_3;
+        basis_z_eval[ipt + 11*npts] = dang_eval_z_3;
+
+        dang_eval_x_0 = radial_eval_alpha*x*y*y*z*z;
+        dang_eval_y_0 = y*z*z*(2*radial_eval + radial_eval_alpha*y*y);
+        dang_eval_z_0 = y*y*z*(2*radial_eval + radial_eval_alpha*z*z);
+        dang_eval_x_1 = radial_eval_alpha*x*y*z*z*z;
+        dang_eval_y_1 = z*z*z*(radial_eval + radial_eval_alpha*y*y);
+        dang_eval_z_1 = y*z*z*(3*radial_eval + radial_eval_alpha*z*z);
+        dang_eval_x_2 = radial_eval_alpha*x*z*z*z*z;
+        dang_eval_y_2 = radial_eval_alpha*y*z*z*z*z;
+        dang_eval_z_2 = z*z*z*(4*radial_eval + radial_eval_alpha*z*z);
+        basis_x_eval[ipt + 12*npts] = dang_eval_x_0;
+        basis_y_eval[ipt + 12*npts] = dang_eval_y_0;
+        basis_z_eval[ipt + 12*npts] = dang_eval_z_0;
+        basis_x_eval[ipt + 13*npts] = dang_eval_x_1;
+        basis_y_eval[ipt + 13*npts] = dang_eval_y_1;
+        basis_z_eval[ipt + 13*npts] = dang_eval_z_1;
+        basis_x_eval[ipt + 14*npts] = dang_eval_x_2;
+        basis_y_eval[ipt + 14*npts] = dang_eval_y_2;
+        basis_z_eval[ipt + 14*npts] = dang_eval_z_2;
+
+#endif
+      } // Loop over points within task
+    } // Loop over tasks
+
+  } // Loop over shells
+} // end kernel
+
+} // namespace GauXC
diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_hessian.hpp
new file mode 100644
index 00000000..5e5ac1c0
--- /dev/null
+++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_hessian.hpp
@@ -0,0 +1,440 @@
+/**
+ * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * through Lawrence Berkeley National Laboratory (subject to receipt of
+ * any required approvals from the U.S. Dept. of Energy). All rights reserved.
+ * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_4( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[16][detail::shell_nprim_max + 1]; + __shared__ double coeff[16][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = 
task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + + auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + double radial_eval_alpha_squared = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval*x*x*x*x; + basis_eval[ipt + 1*npts] = radial_eval*x*x*x*y; + basis_eval[ipt + 2*npts] = radial_eval*x*x*x*z; + basis_eval[ipt + 3*npts] = radial_eval*x*x*y*y; + basis_eval[ipt + 4*npts] = radial_eval*x*x*y*z; + basis_eval[ipt + 5*npts] = radial_eval*x*x*z*z; + 
basis_eval[ipt + 6*npts] = radial_eval*x*y*y*y; + basis_eval[ipt + 7*npts] = radial_eval*x*y*y*z; + basis_eval[ipt + 8*npts] = radial_eval*x*y*z*z; + basis_eval[ipt + 9*npts] = radial_eval*x*z*z*z; + basis_eval[ipt + 10*npts] = radial_eval*y*y*y*y; + basis_eval[ipt + 11*npts] = radial_eval*y*y*y*z; + basis_eval[ipt + 12*npts] = radial_eval*y*y*z*z; + basis_eval[ipt + 13*npts] = radial_eval*y*z*z*z; + basis_eval[ipt + 14*npts] = radial_eval*z*z*z*z; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = x*x*x*(4*radial_eval + radial_eval_alpha*x*x); + basis_x_eval[ipt + 1*npts] = x*x*y*(3*radial_eval + radial_eval_alpha*x*x); + basis_x_eval[ipt + 2*npts] = x*x*z*(3*radial_eval + radial_eval_alpha*x*x); + basis_x_eval[ipt + 3*npts] = x*y*y*(2*radial_eval + radial_eval_alpha*x*x); + basis_x_eval[ipt + 4*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*x*x); + basis_x_eval[ipt + 5*npts] = x*z*z*(2*radial_eval + radial_eval_alpha*x*x); + basis_x_eval[ipt + 6*npts] = y*y*y*(radial_eval + radial_eval_alpha*x*x); + basis_x_eval[ipt + 7*npts] = y*y*z*(radial_eval + radial_eval_alpha*x*x); + basis_x_eval[ipt + 8*npts] = y*z*z*(radial_eval + radial_eval_alpha*x*x); + basis_x_eval[ipt + 9*npts] = z*z*z*(radial_eval + radial_eval_alpha*x*x); + basis_x_eval[ipt + 10*npts] = radial_eval_alpha*x*y*y*y*y; + basis_x_eval[ipt + 11*npts] = radial_eval_alpha*x*y*y*y*z; + basis_x_eval[ipt + 12*npts] = radial_eval_alpha*x*y*y*z*z; + basis_x_eval[ipt + 13*npts] = radial_eval_alpha*x*y*z*z*z; + basis_x_eval[ipt + 14*npts] = radial_eval_alpha*x*z*z*z*z; + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*x*y; + basis_y_eval[ipt + 1*npts] = x*x*x*(radial_eval + radial_eval_alpha*y*y); + basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*x*x*y*z; + basis_y_eval[ipt + 3*npts] = x*x*y*(2*radial_eval + radial_eval_alpha*y*y); + basis_y_eval[ipt + 4*npts] = x*x*z*(radial_eval + radial_eval_alpha*y*y); + basis_y_eval[ipt + 
5*npts] = radial_eval_alpha*x*x*y*z*z; + basis_y_eval[ipt + 6*npts] = x*y*y*(3*radial_eval + radial_eval_alpha*y*y); + basis_y_eval[ipt + 7*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*y*y); + basis_y_eval[ipt + 8*npts] = x*z*z*(radial_eval + radial_eval_alpha*y*y); + basis_y_eval[ipt + 9*npts] = radial_eval_alpha*x*y*z*z*z; + basis_y_eval[ipt + 10*npts] = y*y*y*(4*radial_eval + radial_eval_alpha*y*y); + basis_y_eval[ipt + 11*npts] = y*y*z*(3*radial_eval + radial_eval_alpha*y*y); + basis_y_eval[ipt + 12*npts] = y*z*z*(2*radial_eval + radial_eval_alpha*y*y); + basis_y_eval[ipt + 13*npts] = z*z*z*(radial_eval + radial_eval_alpha*y*y); + basis_y_eval[ipt + 14*npts] = radial_eval_alpha*y*z*z*z*z; + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*x*z; + basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*x*x*y*z; + basis_z_eval[ipt + 2*npts] = x*x*x*(radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 3*npts] = radial_eval_alpha*x*x*y*y*z; + basis_z_eval[ipt + 4*npts] = x*x*y*(radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 5*npts] = x*x*z*(2*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 6*npts] = radial_eval_alpha*x*y*y*y*z; + basis_z_eval[ipt + 7*npts] = x*y*y*(radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 8*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 9*npts] = x*z*z*(3*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 10*npts] = radial_eval_alpha*y*y*y*y*z; + basis_z_eval[ipt + 11*npts] = y*y*y*(radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 12*npts] = y*y*z*(2*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 13*npts] = y*z*z*(3*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 14*npts] = z*z*z*(4*radial_eval + radial_eval_alpha*z*z); + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = x*x*(12*radial_eval + 9*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); + 
basis_xx_eval[ipt + 1*npts] = x*y*(6*radial_eval + 7*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); + basis_xx_eval[ipt + 2*npts] = x*z*(6*radial_eval + 7*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); + basis_xx_eval[ipt + 3*npts] = y*y*(2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); + basis_xx_eval[ipt + 4*npts] = y*z*(2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); + basis_xx_eval[ipt + 5*npts] = z*z*(2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); + basis_xx_eval[ipt + 6*npts] = x*y*y*y*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 7*npts] = x*y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 8*npts] = x*y*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 9*npts] = x*z*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 10*npts] = y*y*y*y*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 11*npts] = y*y*y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 12*npts] = y*y*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 13*npts] = y*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 14*npts] = z*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = x*x*x*y*(4*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xy_eval[ipt + 1*npts] = x*x*(3*radial_eval + radial_eval_alpha*x*x + 3*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); + basis_xy_eval[ipt + 2*npts] = x*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xy_eval[ipt + 3*npts] = x*y*(4*radial_eval + 2*radial_eval_alpha*x*x + 2*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); + basis_xy_eval[ipt + 4*npts] = x*z*(2*radial_eval + 
radial_eval_alpha*x*x + 2*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); + basis_xy_eval[ipt + 5*npts] = x*y*z*z*(2*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xy_eval[ipt + 6*npts] = y*y*(3*radial_eval + 3*radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); + basis_xy_eval[ipt + 7*npts] = y*z*(2*radial_eval + 2*radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); + basis_xy_eval[ipt + 8*npts] = z*z*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); + basis_xy_eval[ipt + 9*npts] = y*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xy_eval[ipt + 10*npts] = x*y*y*y*(4*radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_xy_eval[ipt + 11*npts] = x*y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_xy_eval[ipt + 12*npts] = x*y*z*z*(2*radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_xy_eval[ipt + 13*npts] = x*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_xy_eval[ipt + 14*npts] = radial_eval_alpha_squared*x*y*z*z*z*z; + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = x*x*x*z*(4*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xz_eval[ipt + 1*npts] = x*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xz_eval[ipt + 2*npts] = x*x*(3*radial_eval + radial_eval_alpha*x*x + 3*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); + basis_xz_eval[ipt + 3*npts] = x*y*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xz_eval[ipt + 4*npts] = x*y*(2*radial_eval + radial_eval_alpha*x*x + 2*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); + basis_xz_eval[ipt + 5*npts] = x*z*(4*radial_eval + 2*radial_eval_alpha*x*x + 2*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); + basis_xz_eval[ipt + 6*npts] = y*y*y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + 
basis_xz_eval[ipt + 7*npts] = y*y*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); + basis_xz_eval[ipt + 8*npts] = y*z*(2*radial_eval + 2*radial_eval_alpha*x*x + radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); + basis_xz_eval[ipt + 9*npts] = z*z*(3*radial_eval + 3*radial_eval_alpha*x*x + radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); + basis_xz_eval[ipt + 10*npts] = radial_eval_alpha_squared*x*y*y*y*y*z; + basis_xz_eval[ipt + 11*npts] = x*y*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_xz_eval[ipt + 12*npts] = x*y*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_xz_eval[ipt + 13*npts] = x*y*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_xz_eval[ipt + 14*npts] = x*z*z*z*(4*radial_eval_alpha + radial_eval_alpha_squared*z*z); + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = x*x*x*x*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 1*npts] = x*x*x*y*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 2*npts] = x*x*x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 3*npts] = x*x*(2*radial_eval + 5*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); + basis_yy_eval[ipt + 4*npts] = x*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 5*npts] = x*x*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 6*npts] = x*y*(6*radial_eval + 7*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); + basis_yy_eval[ipt + 7*npts] = x*z*(2*radial_eval + 5*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); + basis_yy_eval[ipt + 8*npts] = x*y*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 9*npts] = x*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 10*npts] = y*y*(12*radial_eval + 9*radial_eval_alpha*y*y + 
radial_eval_alpha_squared*y*y*y*y); + basis_yy_eval[ipt + 11*npts] = y*z*(6*radial_eval + 7*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); + basis_yy_eval[ipt + 12*npts] = z*z*(2*radial_eval + 5*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); + basis_yy_eval[ipt + 13*npts] = y*z*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 14*npts] = z*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x*x*x*x*y*z; + basis_yz_eval[ipt + 1*npts] = x*x*x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yz_eval[ipt + 2*npts] = x*x*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_yz_eval[ipt + 3*npts] = x*x*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yz_eval[ipt + 4*npts] = x*x*(radial_eval + radial_eval_alpha*y*y + radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); + basis_yz_eval[ipt + 5*npts] = x*x*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_yz_eval[ipt + 6*npts] = x*y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yz_eval[ipt + 7*npts] = x*y*(2*radial_eval + radial_eval_alpha*y*y + 2*radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); + basis_yz_eval[ipt + 8*npts] = x*z*(2*radial_eval + 2*radial_eval_alpha*y*y + radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); + basis_yz_eval[ipt + 9*npts] = x*y*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_yz_eval[ipt + 10*npts] = y*y*y*z*(4*radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yz_eval[ipt + 11*npts] = y*y*(3*radial_eval + radial_eval_alpha*y*y + 3*radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); + basis_yz_eval[ipt + 12*npts] = y*z*(4*radial_eval + 2*radial_eval_alpha*y*y + 2*radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); + basis_yz_eval[ipt + 13*npts] = z*z*(3*radial_eval + 
3*radial_eval_alpha*y*y + radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); + basis_yz_eval[ipt + 14*npts] = y*z*z*z*(4*radial_eval_alpha + radial_eval_alpha_squared*z*z); + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = x*x*x*x*(radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 1*npts] = x*x*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 2*npts] = x*x*x*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 3*npts] = x*x*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 4*npts] = x*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 5*npts] = x*x*(2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); + basis_zz_eval[ipt + 6*npts] = x*y*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 7*npts] = x*y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 8*npts] = x*y*(2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); + basis_zz_eval[ipt + 9*npts] = x*z*(6*radial_eval + 7*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); + basis_zz_eval[ipt + 10*npts] = y*y*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 11*npts] = y*y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 12*npts] = y*y*(2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); + basis_zz_eval[ipt + 13*npts] = y*z*(6*radial_eval + 7*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); + basis_zz_eval[ipt + 14*npts] = z*z*(12*radial_eval + 9*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + double ang_eval_3; + + + ang_eval_0 = radial_eval*x*x*x*x; + ang_eval_1 = radial_eval*x*x*x*y; + 
ang_eval_2 = radial_eval*x*x*x*z; + ang_eval_3 = radial_eval*x*x*y*y; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + basis_eval[ipt + 3*npts] = ang_eval_3; + + ang_eval_0 = radial_eval*x*x*y*z; + ang_eval_1 = radial_eval*x*x*z*z; + ang_eval_2 = radial_eval*x*y*y*y; + ang_eval_3 = radial_eval*x*y*y*z; + basis_eval[ipt + 4*npts] = ang_eval_0; + basis_eval[ipt + 5*npts] = ang_eval_1; + basis_eval[ipt + 6*npts] = ang_eval_2; + basis_eval[ipt + 7*npts] = ang_eval_3; + + ang_eval_0 = radial_eval*x*y*z*z; + ang_eval_1 = radial_eval*x*z*z*z; + ang_eval_2 = radial_eval*y*y*y*y; + ang_eval_3 = radial_eval*y*y*y*z; + basis_eval[ipt + 8*npts] = ang_eval_0; + basis_eval[ipt + 9*npts] = ang_eval_1; + basis_eval[ipt + 10*npts] = ang_eval_2; + basis_eval[ipt + 11*npts] = ang_eval_3; + + ang_eval_0 = radial_eval*y*y*z*z; + ang_eval_1 = radial_eval*y*z*z*z; + ang_eval_2 = radial_eval*z*z*z*z; + basis_eval[ipt + 12*npts] = ang_eval_0; + basis_eval[ipt + 13*npts] = ang_eval_1; + basis_eval[ipt + 14*npts] = ang_eval_2; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; + + dang_eval_x_0 = x*x*x*(4*radial_eval + radial_eval_alpha*x*x); + dang_eval_y_0 = radial_eval_alpha*x*x*x*x*y; + dang_eval_z_0 = radial_eval_alpha*x*x*x*x*z; + dang_eval_x_1 = x*x*y*(3*radial_eval + radial_eval_alpha*x*x); + dang_eval_y_1 = x*x*x*(radial_eval + radial_eval_alpha*y*y); + dang_eval_z_1 = radial_eval_alpha*x*x*x*y*z; + dang_eval_x_2 = x*x*z*(3*radial_eval + radial_eval_alpha*x*x); + dang_eval_y_2 = radial_eval_alpha*x*x*x*y*z; + dang_eval_z_2 = x*x*x*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_3 = x*y*y*(2*radial_eval + radial_eval_alpha*x*x); + dang_eval_y_3 = x*x*y*(2*radial_eval + radial_eval_alpha*y*y); + dang_eval_z_3 = radial_eval_alpha*x*x*y*y*z; 
+ basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + basis_x_eval[ipt + 3*npts] = dang_eval_x_3; + basis_y_eval[ipt + 3*npts] = dang_eval_y_3; + basis_z_eval[ipt + 3*npts] = dang_eval_z_3; + + dang_eval_x_0 = x*y*z*(2*radial_eval + radial_eval_alpha*x*x); + dang_eval_y_0 = x*x*z*(radial_eval + radial_eval_alpha*y*y); + dang_eval_z_0 = x*x*y*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_1 = x*z*z*(2*radial_eval + radial_eval_alpha*x*x); + dang_eval_y_1 = radial_eval_alpha*x*x*y*z*z; + dang_eval_z_1 = x*x*z*(2*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_2 = y*y*y*(radial_eval + radial_eval_alpha*x*x); + dang_eval_y_2 = x*y*y*(3*radial_eval + radial_eval_alpha*y*y); + dang_eval_z_2 = radial_eval_alpha*x*y*y*y*z; + dang_eval_x_3 = y*y*z*(radial_eval + radial_eval_alpha*x*x); + dang_eval_y_3 = x*y*z*(2*radial_eval + radial_eval_alpha*y*y); + dang_eval_z_3 = x*y*y*(radial_eval + radial_eval_alpha*z*z); + basis_x_eval[ipt + 4*npts] = dang_eval_x_0; + basis_y_eval[ipt + 4*npts] = dang_eval_y_0; + basis_z_eval[ipt + 4*npts] = dang_eval_z_0; + basis_x_eval[ipt + 5*npts] = dang_eval_x_1; + basis_y_eval[ipt + 5*npts] = dang_eval_y_1; + basis_z_eval[ipt + 5*npts] = dang_eval_z_1; + basis_x_eval[ipt + 6*npts] = dang_eval_x_2; + basis_y_eval[ipt + 6*npts] = dang_eval_y_2; + basis_z_eval[ipt + 6*npts] = dang_eval_z_2; + basis_x_eval[ipt + 7*npts] = dang_eval_x_3; + basis_y_eval[ipt + 7*npts] = dang_eval_y_3; + basis_z_eval[ipt + 7*npts] = dang_eval_z_3; + + dang_eval_x_0 = y*z*z*(radial_eval + radial_eval_alpha*x*x); + dang_eval_y_0 = x*z*z*(radial_eval + radial_eval_alpha*y*y); + dang_eval_z_0 = 
x*y*z*(2*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_1 = z*z*z*(radial_eval + radial_eval_alpha*x*x); + dang_eval_y_1 = radial_eval_alpha*x*y*z*z*z; + dang_eval_z_1 = x*z*z*(3*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_2 = radial_eval_alpha*x*y*y*y*y; + dang_eval_y_2 = y*y*y*(4*radial_eval + radial_eval_alpha*y*y); + dang_eval_z_2 = radial_eval_alpha*y*y*y*y*z; + dang_eval_x_3 = radial_eval_alpha*x*y*y*y*z; + dang_eval_y_3 = y*y*z*(3*radial_eval + radial_eval_alpha*y*y); + dang_eval_z_3 = y*y*y*(radial_eval + radial_eval_alpha*z*z); + basis_x_eval[ipt + 8*npts] = dang_eval_x_0; + basis_y_eval[ipt + 8*npts] = dang_eval_y_0; + basis_z_eval[ipt + 8*npts] = dang_eval_z_0; + basis_x_eval[ipt + 9*npts] = dang_eval_x_1; + basis_y_eval[ipt + 9*npts] = dang_eval_y_1; + basis_z_eval[ipt + 9*npts] = dang_eval_z_1; + basis_x_eval[ipt + 10*npts] = dang_eval_x_2; + basis_y_eval[ipt + 10*npts] = dang_eval_y_2; + basis_z_eval[ipt + 10*npts] = dang_eval_z_2; + basis_x_eval[ipt + 11*npts] = dang_eval_x_3; + basis_y_eval[ipt + 11*npts] = dang_eval_y_3; + basis_z_eval[ipt + 11*npts] = dang_eval_z_3; + + dang_eval_x_0 = radial_eval_alpha*x*y*y*z*z; + dang_eval_y_0 = y*z*z*(2*radial_eval + radial_eval_alpha*y*y); + dang_eval_z_0 = y*y*z*(2*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_1 = radial_eval_alpha*x*y*z*z*z; + dang_eval_y_1 = z*z*z*(radial_eval + radial_eval_alpha*y*y); + dang_eval_z_1 = y*z*z*(3*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_2 = radial_eval_alpha*x*z*z*z*z; + dang_eval_y_2 = radial_eval_alpha*y*z*z*z*z; + dang_eval_z_2 = z*z*z*(4*radial_eval + radial_eval_alpha*z*z); + basis_x_eval[ipt + 12*npts] = dang_eval_x_0; + basis_y_eval[ipt + 12*npts] = dang_eval_y_0; + basis_z_eval[ipt + 12*npts] = dang_eval_z_0; + basis_x_eval[ipt + 13*npts] = dang_eval_x_1; + basis_y_eval[ipt + 13*npts] = dang_eval_y_1; + basis_z_eval[ipt + 13*npts] = dang_eval_z_1; + basis_x_eval[ipt + 14*npts] = dang_eval_x_2; + basis_y_eval[ipt + 14*npts] = 
dang_eval_y_2; + basis_z_eval[ipt + 14*npts] = dang_eval_z_2; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_gradient.hpp index 50e904f0..320d1f2a 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_gradient.hpp @@ -113,14 +113,14 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt x basis_x_eval[ipt + 0*npts] = sqrt_3*y*(radial_eval + radial_eval_alpha*x*x); basis_x_eval[ipt + 1*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_x_eval[ipt + 2*npts] = -x*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 2*z*z))/2; + basis_x_eval[ipt + 2*npts] = x*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; basis_x_eval[ipt + 3*npts] = sqrt_3*z*(radial_eval + radial_eval_alpha*x*x); basis_x_eval[ipt + 4*npts] = sqrt_3*x*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; // Evaluate first derivative of bfn wrt y basis_y_eval[ipt + 0*npts] = sqrt_3*x*(radial_eval + radial_eval_alpha*y*y); basis_y_eval[ipt + 1*npts] = sqrt_3*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = -y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 2*z*z))/2; + basis_y_eval[ipt + 2*npts] = y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; basis_y_eval[ipt + 3*npts] = sqrt_3*radial_eval_alpha*x*y*z; basis_y_eval[ipt + 4*npts] = sqrt_3*y*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; @@ -170,8 +170,8 @@ __global__ __launch_bounds__(512,2) void 
collocation_device_shell_to_task_kernel dang_eval_x_1 = sqrt_3*radial_eval_alpha*x*y*z; dang_eval_y_1 = sqrt_3*z*(radial_eval + radial_eval_alpha*y*y); dang_eval_z_1 = sqrt_3*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = -x*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_y_2 = -y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 2*z*z))/2; + dang_eval_x_2 = x*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; + dang_eval_y_2 = y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; dang_eval_z_2 = z*(4*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; dang_eval_x_3 = sqrt_3*z*(radial_eval + radial_eval_alpha*x*x); dang_eval_y_3 = sqrt_3*radial_eval_alpha*x*y*z; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_hessian.hpp index abf51d9b..b2224b32 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_hessian.hpp @@ -122,14 +122,14 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt x basis_x_eval[ipt + 0*npts] = sqrt_3*y*(radial_eval + radial_eval_alpha*x*x); basis_x_eval[ipt + 1*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_x_eval[ipt + 2*npts] = -x*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 2*z*z))/2; + basis_x_eval[ipt + 2*npts] = x*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; basis_x_eval[ipt + 3*npts] = sqrt_3*z*(radial_eval + radial_eval_alpha*x*x); basis_x_eval[ipt + 4*npts] = sqrt_3*x*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; // Evaluate first derivative of bfn wrt y basis_y_eval[ipt + 0*npts] = 
sqrt_3*x*(radial_eval + radial_eval_alpha*y*y); basis_y_eval[ipt + 1*npts] = sqrt_3*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = -y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 2*z*z))/2; + basis_y_eval[ipt + 2*npts] = y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; basis_y_eval[ipt + 3*npts] = sqrt_3*radial_eval_alpha*x*y*z; basis_y_eval[ipt + 4*npts] = sqrt_3*y*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; @@ -150,7 +150,7 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate second derivative of bfn wrt xy basis_xy_eval[ipt + 0*npts] = sqrt_3*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); basis_xy_eval[ipt + 1*npts] = sqrt_3*x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 2*npts] = -x*y*(4*radial_eval_alpha + radial_eval_alpha_squared*(x*x + y*y - 2*z*z))/2; + basis_xy_eval[ipt + 2*npts] = x*y*(-4*radial_eval_alpha - radial_eval_alpha_squared*(x*x + y*y - 2*z*z))/2; basis_xy_eval[ipt + 3*npts] = sqrt_3*y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); basis_xy_eval[ipt + 4*npts] = sqrt_3*radial_eval_alpha_squared*x*y*(x*x - y*y)/2; @@ -220,8 +220,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel dang_eval_x_1 = sqrt_3*radial_eval_alpha*x*y*z; dang_eval_y_1 = sqrt_3*z*(radial_eval + radial_eval_alpha*y*y); dang_eval_z_1 = sqrt_3*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = -x*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_y_2 = -y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 2*z*z))/2; + dang_eval_x_2 = x*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; + dang_eval_y_2 = y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; dang_eval_z_2 = z*(4*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; dang_eval_x_3 = sqrt_3*z*(radial_eval + radial_eval_alpha*x*x); dang_eval_y_3 = 
sqrt_3*radial_eval_alpha*x*y*z; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_gradient.hpp index c6331457..dbe7b066 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_gradient.hpp @@ -115,18 +115,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt x basis_x_eval[ipt + 0*npts] = sqrt_10*x*y*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; basis_x_eval[ipt + 1*npts] = sqrt_15*y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = -sqrt_6*x*y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_x_eval[ipt + 3*npts] = -x*z*(6*radial_eval + radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - basis_x_eval[ipt + 4*npts] = -sqrt_6*(radial_eval*(3*x*x + y*y - 4*z*z) + radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; + basis_x_eval[ipt + 2*npts] = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; + basis_x_eval[ipt + 3*npts] = x*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; + basis_x_eval[ipt + 4*npts] = sqrt_6*(-radial_eval*(3*x*x + y*y - 4*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; basis_x_eval[ipt + 5*npts] = sqrt_15*x*z*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; basis_x_eval[ipt + 6*npts] = sqrt_10*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; // Evaluate first derivative of bfn wrt y basis_y_eval[ipt + 0*npts] = sqrt_10*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; basis_y_eval[ipt + 1*npts] = sqrt_15*x*z*(radial_eval + radial_eval_alpha*y*y); - 
basis_y_eval[ipt + 2*npts] = -sqrt_6*(radial_eval*(x*x + 3*y*y - 4*z*z) + radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; - basis_y_eval[ipt + 3*npts] = -y*z*(6*radial_eval + radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - basis_y_eval[ipt + 4*npts] = -sqrt_6*x*y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 4*z*z))/4; + basis_y_eval[ipt + 2*npts] = sqrt_6*(-radial_eval*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; + basis_y_eval[ipt + 3*npts] = y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; + basis_y_eval[ipt + 4*npts] = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; basis_y_eval[ipt + 5*npts] = sqrt_15*y*z*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; basis_y_eval[ipt + 6*npts] = sqrt_10*x*y*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; @@ -182,11 +182,11 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel dang_eval_x_1 = sqrt_15*y*z*(radial_eval + radial_eval_alpha*x*x); dang_eval_y_1 = sqrt_15*x*z*(radial_eval + radial_eval_alpha*y*y); dang_eval_z_1 = sqrt_15*x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = -sqrt_6*x*y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_y_2 = -sqrt_6*(radial_eval*(x*x + 3*y*y - 4*z*z) + radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; + dang_eval_x_2 = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; + dang_eval_y_2 = sqrt_6*(-radial_eval*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; dang_eval_z_2 = sqrt_6*y*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_x_3 = -x*z*(6*radial_eval + radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - dang_eval_y_3 = -y*z*(6*radial_eval + radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; + dang_eval_x_3 = x*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; + dang_eval_y_3 = y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; dang_eval_z_3 = -3*radial_eval*(x*x + 
y*y - 2*z*z)/2 - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 2*z*z)/2; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; @@ -201,8 +201,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = -sqrt_6*(radial_eval*(3*x*x + y*y - 4*z*z) + radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; - dang_eval_y_0 = -sqrt_6*x*y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 4*z*z))/4; + dang_eval_x_0 = sqrt_6*(-radial_eval*(3*x*x + y*y - 4*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; + dang_eval_y_0 = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; dang_eval_z_0 = sqrt_6*x*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; dang_eval_x_1 = sqrt_15*x*z*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; dang_eval_y_1 = sqrt_15*y*z*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_hessian.hpp index 8f7c337c..1d7165a8 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_hessian.hpp @@ -124,18 +124,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt x basis_x_eval[ipt + 0*npts] = sqrt_10*x*y*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; basis_x_eval[ipt + 1*npts] = sqrt_15*y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = -sqrt_6*x*y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_x_eval[ipt + 
3*npts] = -x*z*(6*radial_eval + radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - basis_x_eval[ipt + 4*npts] = -sqrt_6*(radial_eval*(3*x*x + y*y - 4*z*z) + radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; + basis_x_eval[ipt + 2*npts] = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; + basis_x_eval[ipt + 3*npts] = x*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; + basis_x_eval[ipt + 4*npts] = sqrt_6*(-radial_eval*(3*x*x + y*y - 4*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; basis_x_eval[ipt + 5*npts] = sqrt_15*x*z*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; basis_x_eval[ipt + 6*npts] = sqrt_10*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; // Evaluate first derivative of bfn wrt y basis_y_eval[ipt + 0*npts] = sqrt_10*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; basis_y_eval[ipt + 1*npts] = sqrt_15*x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = -sqrt_6*(radial_eval*(x*x + 3*y*y - 4*z*z) + radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; - basis_y_eval[ipt + 3*npts] = -y*z*(6*radial_eval + radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - basis_y_eval[ipt + 4*npts] = -sqrt_6*x*y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 4*z*z))/4; + basis_y_eval[ipt + 2*npts] = sqrt_6*(-radial_eval*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; + basis_y_eval[ipt + 3*npts] = y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; + basis_y_eval[ipt + 4*npts] = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; basis_y_eval[ipt + 5*npts] = sqrt_15*y*z*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; basis_y_eval[ipt + 6*npts] = sqrt_10*x*y*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; @@ -151,18 +151,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate second derivative of bfn wrt xx basis_xx_eval[ipt + 0*npts] = sqrt_10*y*(6*radial_eval + 
12*radial_eval_alpha*x*x + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x - y*y))/4; basis_xx_eval[ipt + 1*npts] = sqrt_15*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 2*npts] = -sqrt_6*y*(2*radial_eval + 4*radial_eval_alpha*x*x + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x + y*y - 4*z*z))/4; - basis_xx_eval[ipt + 3*npts] = -z*(6*radial_eval + 12*radial_eval_alpha*x*x + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x + 3*y*y - 2*z*z))/2; - basis_xx_eval[ipt + 4*npts] = -sqrt_6*x*(6*radial_eval + 2*radial_eval_alpha*(3*x*x + y*y - 4*z*z) + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x + y*y - 4*z*z))/4; + basis_xx_eval[ipt + 2*npts] = sqrt_6*y*(-2*radial_eval - 4*radial_eval_alpha*x*x - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x + y*y - 4*z*z))/4; + basis_xx_eval[ipt + 3*npts] = z*(-6*radial_eval - 12*radial_eval_alpha*x*x - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x + 3*y*y - 2*z*z))/2; + basis_xx_eval[ipt + 4*npts] = sqrt_6*x*(-6*radial_eval - 2*radial_eval_alpha*(3*x*x + y*y - 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x + y*y - 4*z*z))/4; basis_xx_eval[ipt + 5*npts] = sqrt_15*z*(2*radial_eval + 4*radial_eval_alpha*x*x + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x - y*y))/2; basis_xx_eval[ipt + 6*npts] = sqrt_10*x*(6*radial_eval + 6*radial_eval_alpha*(x*x - y*y) + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x - 3*y*y))/4; // Evaluate second derivative of bfn wrt xy basis_xy_eval[ipt + 0*npts] = sqrt_10*x*(6*radial_eval + 3*radial_eval_alpha*x*x + 3*radial_eval_alpha*y*y + 3*radial_eval_alpha_squared*x*x*y*y - radial_eval_alpha_squared*y*y*y*y)/4; basis_xy_eval[ipt + 1*npts] = sqrt_15*z*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 2*npts] = -sqrt_6*x*(2*radial_eval + 2*radial_eval_alpha*y*y + radial_eval_alpha*(x*x + 3*y*y - 4*z*z) 
+ radial_eval_alpha_squared*y*y*(x*x + y*y - 4*z*z))/4; - basis_xy_eval[ipt + 3*npts] = -x*y*z*(12*radial_eval_alpha + radial_eval_alpha_squared*(3*x*x + 3*y*y - 2*z*z))/2; - basis_xy_eval[ipt + 4*npts] = -sqrt_6*y*(2*radial_eval + 2*radial_eval_alpha*x*x + radial_eval_alpha*(3*x*x + y*y - 4*z*z) + radial_eval_alpha_squared*x*x*(x*x + y*y - 4*z*z))/4; + basis_xy_eval[ipt + 2*npts] = sqrt_6*x*(-2*radial_eval - 2*radial_eval_alpha*y*y - radial_eval_alpha*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha_squared*y*y*(x*x + y*y - 4*z*z))/4; + basis_xy_eval[ipt + 3*npts] = x*y*z*(-12*radial_eval_alpha - radial_eval_alpha_squared*(3*x*x + 3*y*y - 2*z*z))/2; + basis_xy_eval[ipt + 4*npts] = sqrt_6*y*(-2*radial_eval - 2*radial_eval_alpha*x*x - radial_eval_alpha*(3*x*x + y*y - 4*z*z) - radial_eval_alpha_squared*x*x*(x*x + y*y - 4*z*z))/4; basis_xy_eval[ipt + 5*npts] = sqrt_15*radial_eval_alpha_squared*x*y*z*(x*x - y*y)/2; basis_xy_eval[ipt + 6*npts] = sqrt_10*y*(-6*radial_eval - 3*radial_eval_alpha*x*x - 3*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x - 3*radial_eval_alpha_squared*x*x*y*y)/4; @@ -178,9 +178,9 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate second derivative of bfn wrt yy basis_yy_eval[ipt + 0*npts] = sqrt_10*y*(-6*radial_eval - 6*radial_eval_alpha*(-x*x + y*y) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x - y*y))/4; basis_yy_eval[ipt + 1*npts] = sqrt_15*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 2*npts] = -sqrt_6*y*(6*radial_eval + 2*radial_eval_alpha*(x*x + 3*y*y - 4*z*z) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x + y*y - 4*z*z))/4; - basis_yy_eval[ipt + 3*npts] = -z*(6*radial_eval + 12*radial_eval_alpha*y*y + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x + 3*y*y - 2*z*z))/2; - basis_yy_eval[ipt + 4*npts] = -sqrt_6*x*(2*radial_eval + 4*radial_eval_alpha*y*y + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x + y*y - 
4*z*z))/4; + basis_yy_eval[ipt + 2*npts] = sqrt_6*y*(-6*radial_eval - 2*radial_eval_alpha*(x*x + 3*y*y - 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x + y*y - 4*z*z))/4; + basis_yy_eval[ipt + 3*npts] = z*(-6*radial_eval - 12*radial_eval_alpha*y*y - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x + 3*y*y - 2*z*z))/2; + basis_yy_eval[ipt + 4*npts] = sqrt_6*x*(-2*radial_eval - 4*radial_eval_alpha*y*y - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x + y*y - 4*z*z))/4; basis_yy_eval[ipt + 5*npts] = sqrt_15*z*(-2*radial_eval - 4*radial_eval_alpha*y*y + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x - y*y))/2; basis_yy_eval[ipt + 6*npts] = sqrt_10*x*(-6*radial_eval - 12*radial_eval_alpha*y*y + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x - 3*y*y))/4; @@ -244,11 +244,11 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel dang_eval_x_1 = sqrt_15*y*z*(radial_eval + radial_eval_alpha*x*x); dang_eval_y_1 = sqrt_15*x*z*(radial_eval + radial_eval_alpha*y*y); dang_eval_z_1 = sqrt_15*x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = -sqrt_6*x*y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_y_2 = -sqrt_6*(radial_eval*(x*x + 3*y*y - 4*z*z) + radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; + dang_eval_x_2 = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; + dang_eval_y_2 = sqrt_6*(-radial_eval*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; dang_eval_z_2 = sqrt_6*y*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_x_3 = -x*z*(6*radial_eval + radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - dang_eval_y_3 = -y*z*(6*radial_eval + radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; + dang_eval_x_3 = x*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; + dang_eval_y_3 = y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; dang_eval_z_3 = -3*radial_eval*(x*x + y*y - 2*z*z)/2 
- radial_eval_alpha*z*z*(3*x*x + 3*y*y - 2*z*z)/2; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; @@ -263,8 +263,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = -sqrt_6*(radial_eval*(3*x*x + y*y - 4*z*z) + radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; - dang_eval_y_0 = -sqrt_6*x*y*(2*radial_eval + radial_eval_alpha*(x*x + y*y - 4*z*z))/4; + dang_eval_x_0 = sqrt_6*(-radial_eval*(3*x*x + y*y - 4*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; + dang_eval_y_0 = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; dang_eval_z_0 = sqrt_6*x*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; dang_eval_x_1 = sqrt_15*x*z*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; dang_eval_y_1 = sqrt_15*y*z*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4.hpp new file mode 100644 index 00000000..075d943e --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4.hpp @@ -0,0 +1,156 @@ +/** + * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). All rights reserved. 
+ * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_4( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[16][detail::shell_nprim_max + 1]; + __shared__ double coeff[16][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = task->points_y; 
+ const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + } + + + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; + basis_eval[ipt + 1*npts] = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; + basis_eval[ipt + 2*npts] = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; + basis_eval[ipt + 3*npts] = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + basis_eval[ipt + 4*npts] = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + basis_eval[ipt + 5*npts] = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; + basis_eval[ipt + 6*npts] = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; + basis_eval[ipt + 7*npts] = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; + basis_eval[ipt + 8*npts] = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + + + + + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + double ang_eval_3; + + + ang_eval_0 = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; + ang_eval_1 = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; + ang_eval_2 = sqrt_5*radial_eval*x*y*(-x*x - y*y 
+ 6*z*z)/2; + ang_eval_3 = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + basis_eval[ipt + 3*npts] = ang_eval_3; + + ang_eval_0 = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + ang_eval_1 = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; + ang_eval_2 = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; + ang_eval_3 = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; + basis_eval[ipt + 4*npts] = ang_eval_0; + basis_eval[ipt + 5*npts] = ang_eval_1; + basis_eval[ipt + 6*npts] = ang_eval_2; + basis_eval[ipt + 7*npts] = ang_eval_3; + + ang_eval_0 = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_eval[ipt + 8*npts] = ang_eval_0; + + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_gradient.hpp new file mode 100644 index 00000000..0dc3f241 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_gradient.hpp @@ -0,0 +1,256 @@ +/** + * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). All rights reserved. 
+ * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_gradient_4( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[16][detail::shell_nprim_max + 1]; + __shared__ double coeff[16][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = 
task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + } + + radial_eval_alpha *= -2; + + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; + basis_eval[ipt + 1*npts] = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; + basis_eval[ipt + 2*npts] = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; + basis_eval[ipt + 3*npts] = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + basis_eval[ipt + 4*npts] = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + basis_eval[ipt + 5*npts] = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; + basis_eval[ipt + 6*npts] = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; + basis_eval[ipt + 7*npts] = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; + basis_eval[ipt + 8*npts] = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + + + + // Evaluate first derivative of 
bfn wrt x + basis_x_eval[ipt + 0*npts] = sqrt_35*y*(radial_eval*(3*x*x - y*y) + radial_eval_alpha*x*x*(x*x - y*y))/2; + basis_x_eval[ipt + 1*npts] = sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; + basis_x_eval[ipt + 2*npts] = sqrt_5*y*(-radial_eval*(3*x*x + y*y - 6*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 6*z*z))/2; + basis_x_eval[ipt + 3*npts] = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; + basis_x_eval[ipt + 4*npts] = x*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + basis_x_eval[ipt + 5*npts] = sqrt_10*z*(-radial_eval*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha*x*x*(3*x*x + 3*y*y - 4*z*z))/4; + basis_x_eval[ipt + 6*npts] = sqrt_5*x*(-4*radial_eval*(x*x - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_x_eval[ipt + 7*npts] = sqrt_70*z*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; + basis_x_eval[ipt + 8*npts] = sqrt_35*x*(4*radial_eval*(x*x - 3*y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = sqrt_35*x*(-radial_eval*(-x*x + 3*y*y) + radial_eval_alpha*y*y*(x*x - y*y))/2; + basis_y_eval[ipt + 1*npts] = sqrt_70*z*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; + basis_y_eval[ipt + 2*npts] = sqrt_5*x*(-radial_eval*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 6*z*z))/2; + basis_y_eval[ipt + 3*npts] = sqrt_10*z*(-radial_eval*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha*y*y*(3*x*x + 3*y*y - 4*z*z))/4; + basis_y_eval[ipt + 4*npts] = y*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + basis_y_eval[ipt + 5*npts] = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; + basis_y_eval[ipt + 6*npts] = sqrt_5*y*(4*radial_eval*(y*y - 3*z*z) - 
radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_y_eval[ipt + 7*npts] = sqrt_70*x*y*z*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; + basis_y_eval[ipt + 8*npts] = sqrt_35*y*(-4*radial_eval*(3*x*x - y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = sqrt_35*radial_eval_alpha*x*y*z*(x*x - y*y)/2; + basis_z_eval[ipt + 1*npts] = sqrt_70*y*(radial_eval + radial_eval_alpha*z*z)*(3*x*x - y*y)/4; + basis_z_eval[ipt + 2*npts] = sqrt_5*x*y*z*(12*radial_eval - radial_eval_alpha*(x*x + y*y - 6*z*z))/2; + basis_z_eval[ipt + 3*npts] = sqrt_10*y*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + basis_z_eval[ipt + 4*npts] = z*(-16*radial_eval*(3*x*x + 3*y*y - 2*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + basis_z_eval[ipt + 5*npts] = sqrt_10*x*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + basis_z_eval[ipt + 6*npts] = sqrt_5*z*(12*radial_eval*(x*x - y*y) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_z_eval[ipt + 7*npts] = sqrt_70*x*(radial_eval + radial_eval_alpha*z*z)*(x*x - 3*y*y)/4; + basis_z_eval[ipt + 8*npts] = sqrt_35*radial_eval_alpha*z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + double ang_eval_3; + + + ang_eval_0 = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; + ang_eval_1 = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; + ang_eval_2 = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; + ang_eval_3 = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + basis_eval[ipt + 3*npts] = ang_eval_3; + + ang_eval_0 = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 
3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + ang_eval_1 = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; + ang_eval_2 = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; + ang_eval_3 = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; + basis_eval[ipt + 4*npts] = ang_eval_0; + basis_eval[ipt + 5*npts] = ang_eval_1; + basis_eval[ipt + 6*npts] = ang_eval_2; + basis_eval[ipt + 7*npts] = ang_eval_3; + + ang_eval_0 = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_eval[ipt + 8*npts] = ang_eval_0; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; + + dang_eval_x_0 = sqrt_35*y*(radial_eval*(3*x*x - y*y) + radial_eval_alpha*x*x*(x*x - y*y))/2; + dang_eval_y_0 = sqrt_35*x*(-radial_eval*(-x*x + 3*y*y) + radial_eval_alpha*y*y*(x*x - y*y))/2; + dang_eval_z_0 = sqrt_35*radial_eval_alpha*x*y*z*(x*x - y*y)/2; + dang_eval_x_1 = sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; + dang_eval_y_1 = sqrt_70*z*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; + dang_eval_z_1 = sqrt_70*y*(radial_eval + radial_eval_alpha*z*z)*(3*x*x - y*y)/4; + dang_eval_x_2 = sqrt_5*y*(-radial_eval*(3*x*x + y*y - 6*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 6*z*z))/2; + dang_eval_y_2 = sqrt_5*x*(-radial_eval*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 6*z*z))/2; + dang_eval_z_2 = sqrt_5*x*y*z*(12*radial_eval - radial_eval_alpha*(x*x + y*y - 6*z*z))/2; + dang_eval_x_3 = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_y_3 = sqrt_10*z*(-radial_eval*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha*y*y*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_z_3 = sqrt_10*y*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = 
dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + basis_x_eval[ipt + 3*npts] = dang_eval_x_3; + basis_y_eval[ipt + 3*npts] = dang_eval_y_3; + basis_z_eval[ipt + 3*npts] = dang_eval_z_3; + + dang_eval_x_0 = x*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + dang_eval_y_0 = y*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + dang_eval_z_0 = z*(-16*radial_eval*(3*x*x + 3*y*y - 2*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + dang_eval_x_1 = sqrt_10*z*(-radial_eval*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha*x*x*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_y_1 = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_z_1 = sqrt_10*x*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_x_2 = sqrt_5*x*(-4*radial_eval*(x*x - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + dang_eval_y_2 = sqrt_5*y*(4*radial_eval*(y*y - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + dang_eval_z_2 = sqrt_5*z*(12*radial_eval*(x*x - y*y) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + dang_eval_x_3 = sqrt_70*z*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; + dang_eval_y_3 = sqrt_70*x*y*z*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; + dang_eval_z_3 = sqrt_70*x*(radial_eval + radial_eval_alpha*z*z)*(x*x - 3*y*y)/4; + basis_x_eval[ipt + 4*npts] = dang_eval_x_0; + basis_y_eval[ipt + 4*npts] = 
dang_eval_y_0; + basis_z_eval[ipt + 4*npts] = dang_eval_z_0; + basis_x_eval[ipt + 5*npts] = dang_eval_x_1; + basis_y_eval[ipt + 5*npts] = dang_eval_y_1; + basis_z_eval[ipt + 5*npts] = dang_eval_z_1; + basis_x_eval[ipt + 6*npts] = dang_eval_x_2; + basis_y_eval[ipt + 6*npts] = dang_eval_y_2; + basis_z_eval[ipt + 6*npts] = dang_eval_z_2; + basis_x_eval[ipt + 7*npts] = dang_eval_x_3; + basis_y_eval[ipt + 7*npts] = dang_eval_y_3; + basis_z_eval[ipt + 7*npts] = dang_eval_z_3; + + dang_eval_x_0 = sqrt_35*x*(4*radial_eval*(x*x - 3*y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + dang_eval_y_0 = sqrt_35*y*(-4*radial_eval*(3*x*x - y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + dang_eval_z_0 = sqrt_35*radial_eval_alpha*z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_x_eval[ipt + 8*npts] = dang_eval_x_0; + basis_y_eval[ipt + 8*npts] = dang_eval_y_0; + basis_z_eval[ipt + 8*npts] = dang_eval_z_0; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_hessian.hpp new file mode 100644 index 00000000..01b2f7cc --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_hessian.hpp @@ -0,0 +1,330 @@ +/** + * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). All rights reserved. 
+ * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_hessian_4( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[16][detail::shell_nprim_max + 1]; + __shared__ double coeff[16][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = 
task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + + auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + double radial_eval_alpha_squared = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; + basis_eval[ipt + 1*npts] = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; + basis_eval[ipt + 2*npts] = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; + basis_eval[ipt + 3*npts] = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + basis_eval[ipt + 
4*npts] = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + basis_eval[ipt + 5*npts] = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; + basis_eval[ipt + 6*npts] = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; + basis_eval[ipt + 7*npts] = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; + basis_eval[ipt + 8*npts] = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = sqrt_35*y*(radial_eval*(3*x*x - y*y) + radial_eval_alpha*x*x*(x*x - y*y))/2; + basis_x_eval[ipt + 1*npts] = sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; + basis_x_eval[ipt + 2*npts] = sqrt_5*y*(-radial_eval*(3*x*x + y*y - 6*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 6*z*z))/2; + basis_x_eval[ipt + 3*npts] = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; + basis_x_eval[ipt + 4*npts] = x*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + basis_x_eval[ipt + 5*npts] = sqrt_10*z*(-radial_eval*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha*x*x*(3*x*x + 3*y*y - 4*z*z))/4; + basis_x_eval[ipt + 6*npts] = sqrt_5*x*(-4*radial_eval*(x*x - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_x_eval[ipt + 7*npts] = sqrt_70*z*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; + basis_x_eval[ipt + 8*npts] = sqrt_35*x*(4*radial_eval*(x*x - 3*y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = sqrt_35*x*(-radial_eval*(-x*x + 3*y*y) + radial_eval_alpha*y*y*(x*x - y*y))/2; + basis_y_eval[ipt + 1*npts] = sqrt_70*z*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; + basis_y_eval[ipt + 2*npts] = sqrt_5*x*(-radial_eval*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 6*z*z))/2; + 
basis_y_eval[ipt + 3*npts] = sqrt_10*z*(-radial_eval*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha*y*y*(3*x*x + 3*y*y - 4*z*z))/4; + basis_y_eval[ipt + 4*npts] = y*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + basis_y_eval[ipt + 5*npts] = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; + basis_y_eval[ipt + 6*npts] = sqrt_5*y*(4*radial_eval*(y*y - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_y_eval[ipt + 7*npts] = sqrt_70*x*y*z*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; + basis_y_eval[ipt + 8*npts] = sqrt_35*y*(-4*radial_eval*(3*x*x - y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = sqrt_35*radial_eval_alpha*x*y*z*(x*x - y*y)/2; + basis_z_eval[ipt + 1*npts] = sqrt_70*y*(radial_eval + radial_eval_alpha*z*z)*(3*x*x - y*y)/4; + basis_z_eval[ipt + 2*npts] = sqrt_5*x*y*z*(12*radial_eval - radial_eval_alpha*(x*x + y*y - 6*z*z))/2; + basis_z_eval[ipt + 3*npts] = sqrt_10*y*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + basis_z_eval[ipt + 4*npts] = z*(-16*radial_eval*(3*x*x + 3*y*y - 2*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + basis_z_eval[ipt + 5*npts] = sqrt_10*x*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + basis_z_eval[ipt + 6*npts] = sqrt_5*z*(12*radial_eval*(x*x - y*y) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_z_eval[ipt + 7*npts] = sqrt_70*x*(radial_eval + radial_eval_alpha*z*z)*(x*x - 3*y*y)/4; + basis_z_eval[ipt + 8*npts] = sqrt_35*radial_eval_alpha*z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = sqrt_35*x*y*(6*radial_eval + 2*radial_eval_alpha*(3*x*x - y*y) + 
(radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x - y*y))/2; + basis_xx_eval[ipt + 1*npts] = sqrt_70*y*z*(6*radial_eval + 12*radial_eval_alpha*x*x + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x - y*y))/4; + basis_xx_eval[ipt + 2*npts] = sqrt_5*x*y*(-6*radial_eval - 2*radial_eval_alpha*(3*x*x + y*y - 6*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x + y*y - 6*z*z))/2; + basis_xx_eval[ipt + 3*npts] = sqrt_10*y*z*(-6*radial_eval - 12*radial_eval_alpha*x*x - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x + 3*y*y - 4*z*z))/4; + basis_xx_eval[ipt + 4*npts] = 3*radial_eval*(3*x*x + y*y - 4*z*z)/2 + 3*radial_eval_alpha*x*x*(x*x + y*y - 4*z*z) + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + basis_xx_eval[ipt + 5*npts] = sqrt_10*x*z*(-18*radial_eval - 2*radial_eval_alpha*(9*x*x + 3*y*y - 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x + 3*y*y - 4*z*z))/4; + basis_xx_eval[ipt + 6*npts] = sqrt_5*(-12*radial_eval*(x*x - z*z) - 8*radial_eval_alpha*x*x*(x*x - 3*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_xx_eval[ipt + 7*npts] = sqrt_70*x*z*(6*radial_eval + 6*radial_eval_alpha*(x*x - y*y) + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x - 3*y*y))/4; + basis_xx_eval[ipt + 8*npts] = sqrt_35*(12*radial_eval*(x*x - y*y) + 8*radial_eval_alpha*x*x*(x*x - 3*y*y) + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = sqrt_35*(3*radial_eval*x*x - 3*radial_eval*y*y + radial_eval_alpha*x*x*x*x - radial_eval_alpha*y*y*y*y + radial_eval_alpha_squared*x*x*x*x*y*y - radial_eval_alpha_squared*x*x*y*y*y*y)/2; + basis_xy_eval[ipt + 1*npts] = sqrt_70*x*z*(6*radial_eval + 3*radial_eval_alpha*x*x + 3*radial_eval_alpha*y*y + 
3*radial_eval_alpha_squared*x*x*y*y - radial_eval_alpha_squared*y*y*y*y)/4; + basis_xy_eval[ipt + 2*npts] = sqrt_5*(-3*radial_eval*(x*x + y*y - 2*z*z) - radial_eval_alpha*x*x*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(3*x*x + y*y - 6*z*z) - radial_eval_alpha_squared*x*x*y*y*(x*x + y*y - 6*z*z))/2; + basis_xy_eval[ipt + 3*npts] = sqrt_10*x*z*(-6*radial_eval - 6*radial_eval_alpha*y*y - radial_eval_alpha*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha_squared*y*y*(3*x*x + 3*y*y - 4*z*z))/4; + basis_xy_eval[ipt + 4*npts] = x*y*(24*radial_eval + 24*radial_eval_alpha*(x*x + y*y - 4*z*z) + radial_eval_alpha_squared*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + basis_xy_eval[ipt + 5*npts] = sqrt_10*y*z*(-6*radial_eval - 6*radial_eval_alpha*x*x - radial_eval_alpha*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha_squared*x*x*(3*x*x + 3*y*y - 4*z*z))/4; + basis_xy_eval[ipt + 6*npts] = sqrt_5*x*y*(-4*radial_eval_alpha*x*x + 4*radial_eval_alpha*y*y - radial_eval_alpha_squared*x*x*x*x + 6*radial_eval_alpha_squared*x*x*z*z + radial_eval_alpha_squared*y*y*y*y - 6*radial_eval_alpha_squared*y*y*z*z)/4; + basis_xy_eval[ipt + 7*npts] = sqrt_70*y*z*(-6*radial_eval - 3*radial_eval_alpha*x*x - 3*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x - 3*radial_eval_alpha_squared*x*x*y*y)/4; + basis_xy_eval[ipt + 8*npts] = sqrt_35*x*y*(-24*radial_eval - 8*radial_eval_alpha*x*x - 8*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x - 6*radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*y*y*y*y)/8; + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = sqrt_35*y*z*(radial_eval_alpha*(3*x*x - y*y) + radial_eval_alpha_squared*x*x*(x*x - y*y))/2; + basis_xz_eval[ipt + 1*npts] = sqrt_70*x*y*(6*radial_eval + 6*radial_eval_alpha*z*z + radial_eval_alpha*(3*x*x - y*y) + radial_eval_alpha_squared*z*z*(3*x*x - y*y))/4; + basis_xz_eval[ipt + 2*npts] = sqrt_5*y*z*(12*radial_eval + 12*radial_eval_alpha*x*x - 
radial_eval_alpha*(3*x*x + y*y - 6*z*z) - radial_eval_alpha_squared*x*x*(x*x + y*y - 6*z*z))/2; + basis_xz_eval[ipt + 3*npts] = sqrt_10*x*y*(-6*radial_eval - 6*radial_eval_alpha*z*z + 3*radial_eval_alpha*(-x*x - y*y + 4*z*z) - radial_eval_alpha_squared*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + basis_xz_eval[ipt + 4*npts] = x*z*(-96*radial_eval - 36*radial_eval_alpha*x*x - 36*radial_eval_alpha*y*y - 16*radial_eval_alpha*z*z + 3*radial_eval_alpha_squared*x*x*x*x + 6*radial_eval_alpha_squared*x*x*y*y - 24*radial_eval_alpha_squared*x*x*z*z + 3*radial_eval_alpha_squared*y*y*y*y - 24*radial_eval_alpha_squared*y*y*z*z + 8*radial_eval_alpha_squared*z*z*z*z)/8; + basis_xz_eval[ipt + 5*npts] = sqrt_10*(-3*radial_eval*(3*x*x + y*y - 4*z*z) + 3*radial_eval_alpha*x*x*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha_squared*x*x*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + basis_xz_eval[ipt + 6*npts] = sqrt_5*x*z*(24*radial_eval + 12*radial_eval_alpha*(x*x - y*y) - 4*radial_eval_alpha*(x*x - 3*z*z) - radial_eval_alpha_squared*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_xz_eval[ipt + 7*npts] = sqrt_70*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y) + 3*radial_eval_alpha*z*z*(x*x - y*y) + radial_eval_alpha_squared*x*x*z*z*(x*x - 3*y*y))/4; + basis_xz_eval[ipt + 8*npts] = sqrt_35*x*z*(4*radial_eval_alpha*(x*x - 3*y*y) + radial_eval_alpha_squared*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = sqrt_35*x*y*(-6*radial_eval - 2*radial_eval_alpha*(-x*x + 3*y*y) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x - y*y))/2; + basis_yy_eval[ipt + 1*npts] = sqrt_70*y*z*(-6*radial_eval - 6*radial_eval_alpha*(-x*x + y*y) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x - y*y))/4; + basis_yy_eval[ipt + 2*npts] = sqrt_5*x*y*(-6*radial_eval - 2*radial_eval_alpha*(x*x + 3*y*y - 6*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x + y*y - 6*z*z))/2; 
+ basis_yy_eval[ipt + 3*npts] = sqrt_10*y*z*(-18*radial_eval - 2*radial_eval_alpha*(3*x*x + 9*y*y - 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x + 3*y*y - 4*z*z))/4; + basis_yy_eval[ipt + 4*npts] = 3*radial_eval*(x*x + 3*y*y - 4*z*z)/2 + 3*radial_eval_alpha*y*y*(x*x + y*y - 4*z*z) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + basis_yy_eval[ipt + 5*npts] = sqrt_10*x*z*(-6*radial_eval - 12*radial_eval_alpha*y*y - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x + 3*y*y - 4*z*z))/4; + basis_yy_eval[ipt + 6*npts] = sqrt_5*(12*radial_eval*(y*y - z*z) + 8*radial_eval_alpha*y*y*(y*y - 3*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_yy_eval[ipt + 7*npts] = sqrt_70*x*z*(-6*radial_eval - 12*radial_eval_alpha*y*y + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x - 3*y*y))/4; + basis_yy_eval[ipt + 8*npts] = sqrt_35*(-12*radial_eval*(x*x - y*y) - 8*radial_eval_alpha*y*y*(3*x*x - y*y) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = sqrt_35*x*z*(-radial_eval_alpha*(-x*x + 3*y*y) + radial_eval_alpha_squared*y*y*(x*x - y*y))/2; + basis_yz_eval[ipt + 1*npts] = sqrt_70*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y) - 3*radial_eval_alpha*z*z*(-x*x + y*y) + radial_eval_alpha_squared*y*y*z*z*(3*x*x - y*y))/4; + basis_yz_eval[ipt + 2*npts] = sqrt_5*x*z*(12*radial_eval + 12*radial_eval_alpha*y*y - radial_eval_alpha*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha_squared*y*y*(x*x + y*y - 6*z*z))/2; + basis_yz_eval[ipt + 3*npts] = sqrt_10*(-3*radial_eval*(x*x + 3*y*y - 4*z*z) + 3*radial_eval_alpha*y*y*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha_squared*y*y*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + basis_yz_eval[ipt + 4*npts] = 
y*z*(-96*radial_eval - 36*radial_eval_alpha*x*x - 36*radial_eval_alpha*y*y - 16*radial_eval_alpha*z*z + 3*radial_eval_alpha_squared*x*x*x*x + 6*radial_eval_alpha_squared*x*x*y*y - 24*radial_eval_alpha_squared*x*x*z*z + 3*radial_eval_alpha_squared*y*y*y*y - 24*radial_eval_alpha_squared*y*y*z*z + 8*radial_eval_alpha_squared*z*z*z*z)/8; + basis_yz_eval[ipt + 5*npts] = sqrt_10*x*y*(-6*radial_eval - 6*radial_eval_alpha*z*z + 3*radial_eval_alpha*(-x*x - y*y + 4*z*z) - radial_eval_alpha_squared*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + basis_yz_eval[ipt + 6*npts] = sqrt_5*y*z*(-24*radial_eval + 12*radial_eval_alpha*(x*x - y*y) + 4*radial_eval_alpha*(y*y - 3*z*z) - radial_eval_alpha_squared*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_yz_eval[ipt + 7*npts] = sqrt_70*x*y*(-6*radial_eval - 6*radial_eval_alpha*z*z + radial_eval_alpha*(x*x - 3*y*y) + radial_eval_alpha_squared*z*z*(x*x - 3*y*y))/4; + basis_yz_eval[ipt + 8*npts] = sqrt_35*y*z*(-4*radial_eval_alpha*(3*x*x - y*y) + radial_eval_alpha_squared*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = sqrt_35*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x - y*y)/2; + basis_zz_eval[ipt + 1*npts] = sqrt_70*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z)*(3*x*x - y*y)/4; + basis_zz_eval[ipt + 2*npts] = sqrt_5*x*y*(12*radial_eval + 24*radial_eval_alpha*z*z - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x + y*y - 6*z*z))/2; + basis_zz_eval[ipt + 3*npts] = sqrt_10*y*z*(24*radial_eval + 6*radial_eval_alpha*(-x*x - y*y + 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(3*x*x + 3*y*y - 4*z*z))/4; + basis_zz_eval[ipt + 4*npts] = -6*radial_eval*(x*x + y*y - 2*z*z) - 4*radial_eval_alpha*z*z*(3*x*x + 3*y*y - 2*z*z) + (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + basis_zz_eval[ipt + 5*npts] = sqrt_10*x*z*(24*radial_eval + 
6*radial_eval_alpha*(-x*x - y*y + 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(3*x*x + 3*y*y - 4*z*z))/4; + basis_zz_eval[ipt + 6*npts] = sqrt_5*(12*radial_eval*(x*x - y*y) + 24*radial_eval_alpha*z*z*(x*x - y*y) - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + basis_zz_eval[ipt + 7*npts] = sqrt_70*x*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x - 3*y*y)/4; + basis_zz_eval[ipt + 8*npts] = sqrt_35*(radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + double ang_eval_3; + + + ang_eval_0 = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; + ang_eval_1 = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; + ang_eval_2 = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; + ang_eval_3 = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + basis_eval[ipt + 3*npts] = ang_eval_3; + + ang_eval_0 = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + ang_eval_1 = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; + ang_eval_2 = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; + ang_eval_3 = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; + basis_eval[ipt + 4*npts] = ang_eval_0; + basis_eval[ipt + 5*npts] = ang_eval_1; + basis_eval[ipt + 6*npts] = ang_eval_2; + basis_eval[ipt + 7*npts] = ang_eval_3; + + ang_eval_0 = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_eval[ipt + 8*npts] = ang_eval_0; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; + + dang_eval_x_0 = sqrt_35*y*(radial_eval*(3*x*x - y*y) + 
radial_eval_alpha*x*x*(x*x - y*y))/2; + dang_eval_y_0 = sqrt_35*x*(-radial_eval*(-x*x + 3*y*y) + radial_eval_alpha*y*y*(x*x - y*y))/2; + dang_eval_z_0 = sqrt_35*radial_eval_alpha*x*y*z*(x*x - y*y)/2; + dang_eval_x_1 = sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; + dang_eval_y_1 = sqrt_70*z*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; + dang_eval_z_1 = sqrt_70*y*(radial_eval + radial_eval_alpha*z*z)*(3*x*x - y*y)/4; + dang_eval_x_2 = sqrt_5*y*(-radial_eval*(3*x*x + y*y - 6*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 6*z*z))/2; + dang_eval_y_2 = sqrt_5*x*(-radial_eval*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 6*z*z))/2; + dang_eval_z_2 = sqrt_5*x*y*z*(12*radial_eval - radial_eval_alpha*(x*x + y*y - 6*z*z))/2; + dang_eval_x_3 = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_y_3 = sqrt_10*z*(-radial_eval*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha*y*y*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_z_3 = sqrt_10*y*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + basis_x_eval[ipt + 3*npts] = dang_eval_x_3; + basis_y_eval[ipt + 3*npts] = dang_eval_y_3; + basis_z_eval[ipt + 3*npts] = dang_eval_z_3; + + dang_eval_x_0 = x*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + dang_eval_y_0 = y*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + dang_eval_z_0 = 
z*(-16*radial_eval*(3*x*x + 3*y*y - 2*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; + dang_eval_x_1 = sqrt_10*z*(-radial_eval*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha*x*x*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_y_1 = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_z_1 = sqrt_10*x*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_x_2 = sqrt_5*x*(-4*radial_eval*(x*x - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + dang_eval_y_2 = sqrt_5*y*(4*radial_eval*(y*y - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + dang_eval_z_2 = sqrt_5*z*(12*radial_eval*(x*x - y*y) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + dang_eval_x_3 = sqrt_70*z*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; + dang_eval_y_3 = sqrt_70*x*y*z*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; + dang_eval_z_3 = sqrt_70*x*(radial_eval + radial_eval_alpha*z*z)*(x*x - 3*y*y)/4; + basis_x_eval[ipt + 4*npts] = dang_eval_x_0; + basis_y_eval[ipt + 4*npts] = dang_eval_y_0; + basis_z_eval[ipt + 4*npts] = dang_eval_z_0; + basis_x_eval[ipt + 5*npts] = dang_eval_x_1; + basis_y_eval[ipt + 5*npts] = dang_eval_y_1; + basis_z_eval[ipt + 5*npts] = dang_eval_z_1; + basis_x_eval[ipt + 6*npts] = dang_eval_x_2; + basis_y_eval[ipt + 6*npts] = dang_eval_y_2; + basis_z_eval[ipt + 6*npts] = dang_eval_z_2; + basis_x_eval[ipt + 7*npts] = dang_eval_x_3; + basis_y_eval[ipt + 7*npts] = dang_eval_y_3; + basis_z_eval[ipt + 7*npts] = dang_eval_z_3; + + dang_eval_x_0 = sqrt_35*x*(4*radial_eval*(x*x - 3*y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + dang_eval_y_0 = sqrt_35*y*(-4*radial_eval*(3*x*x - y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + dang_eval_z_0 = sqrt_35*radial_eval_alpha*z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_x_eval[ipt + 8*npts] 
= dang_eval_x_0; + basis_y_eval[ipt + 8*npts] = dang_eval_y_0; + basis_z_eval[ipt + 8*npts] = dang_eval_z_0; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_device.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_device.cu index bb945bab..ab8e5c70 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_device.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_device.cu @@ -17,6 +17,8 @@ #include "device_specific/cuda_device_constants.hpp" +#define GAUXC_CUDA_MAX_L 4 + namespace GauXC { @@ -254,13 +256,17 @@ uint32_t max_threads_shell_to_task_collocation( int32_t l, bool pure ) { case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_1 ); case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_2 ); case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_3 ); + case 4: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_4 ); + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } else { switch(l) { case 0: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_0 ); case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_1 ); case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_2 ); case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_3 ); + case 4: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_4 ); + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } return 0; @@ -289,6 +295,10
@@ void dispatch_shell_to_task_collocation( cudaStream_t stream, int32_t l, case 3: collocation_device_shell_to_task_kernel_spherical_3<<>>( nshells, std::forward(args)... ); break; + case 4: + collocation_device_shell_to_task_kernel_spherical_4<<>>( nshells, std::forward(args)... ); + break; + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } else { switch(l) { @@ -304,6 +314,10 @@ void dispatch_shell_to_task_collocation( cudaStream_t stream, int32_t l, case 3: collocation_device_shell_to_task_kernel_cartesian_3<<>>( nshells, std::forward(args)... ); break; + case 4: + collocation_device_shell_to_task_kernel_cartesian_4<<>>( nshells, std::forward(args)... ); + break; + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } } @@ -338,6 +352,8 @@ uint32_t max_threads_shell_to_task_collocation_gradient( int32_t l, bool pure ) case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_gradient_1 ); case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_gradient_2 ); case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_gradient_3 ); + case 4: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_gradient_4 ); + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } else { switch(l) { @@ -345,6 +361,8 @@ uint32_t max_threads_shell_to_task_collocation_gradient( int32_t l, bool pure ) case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_gradient_1 ); case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_gradient_2 ); case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_gradient_3 ); + case 4: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_gradient_4 ); + default:
GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } return 0; @@ -373,6 +391,10 @@ void dispatch_shell_to_task_collocation_gradient( cudaStream_t stream, int32_t l case 3: collocation_device_shell_to_task_kernel_spherical_gradient_3<<>>( nshells, std::forward(args)... ); break; + case 4: + collocation_device_shell_to_task_kernel_spherical_gradient_4<<>>( nshells, std::forward(args)... ); + break; + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } else { switch(l) { @@ -388,6 +410,10 @@ void dispatch_shell_to_task_collocation_gradient( cudaStream_t stream, int32_t l case 3: collocation_device_shell_to_task_kernel_cartesian_gradient_3<<>>( nshells, std::forward(args)... ); break; + case 4: + collocation_device_shell_to_task_kernel_cartesian_gradient_4<<>>( nshells, std::forward(args)... ); + break; + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } @@ -423,6 +449,8 @@ uint32_t max_threads_shell_to_task_collocation_hessian( int32_t l, bool pure ) { case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_hessian_1 ); case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_hessian_2 ); case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_hessian_3 ); + case 4: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_hessian_4 ); + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } else { switch(l) { @@ -430,6 +458,8 @@ uint32_t max_threads_shell_to_task_collocation_hessian( int32_t l, bool pure ) { case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_hessian_1 ); case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_hessian_2 ); case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_hessian_3 ); + case 4: return
util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_hessian_4 ); + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } return 0; @@ -458,6 +488,10 @@ void dispatch_shell_to_task_collocation_hessian( cudaStream_t stream, int32_t l, case 3: collocation_device_shell_to_task_kernel_spherical_hessian_3<<>>( nshells, std::forward(args)... ); break; + case 4: + collocation_device_shell_to_task_kernel_spherical_hessian_4<<>>( nshells, std::forward(args)... ); + break; + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } else { switch(l) { @@ -473,6 +507,10 @@ void dispatch_shell_to_task_collocation_hessian( cudaStream_t stream, int32_t l, case 3: collocation_device_shell_to_task_kernel_cartesian_hessian_3<<>>( nshells, std::forward(args)... ); break; + case 4: + collocation_device_shell_to_task_kernel_cartesian_hessian_4<<>>( nshells, std::forward(args)... ); + break; + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_shell_to_task_kernels.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_shell_to_task_kernels.hpp index 5d2d3e5a..b2848053 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_shell_to_task_kernels.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_shell_to_task_kernels.hpp @@ -11,28 +11,34 @@ #include "collocation/collocation_shell_to_task_kernels_cartesian_l1.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l2.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l3.hpp" +#include "collocation/collocation_shell_to_task_kernels_cartesian_l4.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l0_gradient.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l1_gradient.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l2_gradient.hpp" #include
"collocation/collocation_shell_to_task_kernels_cartesian_l3_gradient.hpp" +#include "collocation/collocation_shell_to_task_kernels_cartesian_l4_gradient.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l0_hessian.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l1_hessian.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l2_hessian.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l3_hessian.hpp" +#include "collocation/collocation_shell_to_task_kernels_cartesian_l4_hessian.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l0.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l1.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l2.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l3.hpp" +#include "collocation/collocation_shell_to_task_kernels_spherical_l4.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l0_gradient.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l1_gradient.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l2_gradient.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l3_gradient.hpp" +#include "collocation/collocation_shell_to_task_kernels_spherical_l4_gradient.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l0_hessian.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l1_hessian.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l2_hessian.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l3_hessian.hpp" +#include "collocation/collocation_shell_to_task_kernels_spherical_l4_hessian.hpp" diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx index 961c3bc1..cb1c330a 100644 --- 
a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx @@ -1219,118 +1219,907 @@ void integral_3(size_t npts, double *Xik = (Xi + p_outer + p_inner); double *Gik = (Gi + p_outer + p_inner); - for(int c0 = 0; c0 <= 3; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 3 - c0; - int p = c1; - - int idxB = (((3 - m) * (3 - m + 1)) >> 1) + p; - - int mv, pv; - - SIMD_TYPE tx, wg, xik, gik; - mv = 3 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); - mv = 2 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); - mv = 2 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); - mv = 1 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, 
xik, gik); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); - mv = 1 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); - mv = 1 + m; pv = 2 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); - mv = 0 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); - mv = 0 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); - mv = 0 + m; pv = 2 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 8 * 
ldG), gik); - mv = 0 + m; pv = 3 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); - } - } + SIMD_TYPE tx, wg, xik, gik; + tx = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 49 
* NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + 
p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = 
SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * 
ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 68 
* NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + 
p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = 
SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); } } @@ -3680,236 +4469,1814 @@ void integral_3(size_t npts, double *Xik = (Xi + p_outer + p_inner); double *Gik = (Gi + p_outer + p_inner); - for(int c0 = 0; c0 <= 3; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 3 - c0; - int p = c1; - - int idxB = (((3 - m) * (3 - m + 1)) >> 1) + p; - - int mv, pv; - - SIMD_TYPE tx, wg, xik, gik; - mv = 3 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); - mv = 2 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); - mv = 2 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - - tx = 
SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); - mv = 1 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); - mv = 1 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); - mv = 1 + m; pv = 2 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); - mv = 0 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); - mv = 0 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, 
gik); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); - mv = 0 + m; pv = 2 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); - mv = 0 + m; pv = 3 + p; - tx = SIMD_ALIGNED_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); - } - } - } + SIMD_TYPE tx, wg, xik, gik; + tx = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(; p_inner < npts_inner; p_inner += SCALAR_LENGTH) { - double *Xik = (Xi + p_outer + p_inner); - double *Gik = (Gi + p_outer + p_inner); + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - for(int c0 = 0; c0 <= 3; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 3 - c0; - int p = c1; - - int idxB = (((3 - m) * (3 - m + 1)) >> 1) + p; - - int mv, pv; - - SCALAR_TYPE tx, wg, xik, gik; - mv = 3 + m; pv = 0 + p; - tx = SCALAR_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 0 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 0 * ldG), gik); - mv = 2 + m; pv = 0 + p; - tx = SCALAR_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + 
idxB * ldX)); - gik = SCALAR_LOAD((Gik + 1 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 1 * ldG), gik); - mv = 2 + m; pv = 1 + p; - tx = SCALAR_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 2 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 2 * ldG), gik); - mv = 1 + m; pv = 0 + p; - tx = SCALAR_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 3 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 3 * ldG), gik); - mv = 1 + m; pv = 1 + p; - tx = SCALAR_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 4 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 4 * ldG), gik); - mv = 1 + m; pv = 2 + p; - tx = SCALAR_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 5 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 5 * ldG), gik); - mv = 0 + m; pv = 0 + p; - tx = SCALAR_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 6 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 6 * ldG), gik); - mv = 0 + m; pv = 1 + p; - tx = SCALAR_LOAD((temp + (46 + 
(((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 7 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 7 * ldG), gik); - mv = 0 + m; pv = 2 + p; - tx = SCALAR_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 8 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 8 * ldG), gik); - mv = 0 + m; pv = 3 + p; - tx = SCALAR_LOAD((temp + (46 + (((6 - mv) * (6 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 9 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 9 * ldG), gik); - } - } + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, 
wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), 
gik); + tx = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * 
ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 
* NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + 
p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = 
SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * 
ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 71 
* NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + } + + for(; p_inner < npts_inner; p_inner += SCALAR_LENGTH) { + double *Xik = (Xi + p_outer + p_inner); + double *Gik = (Gi + p_outer + p_inner); + + SCALAR_TYPE tx, wg, xik, gik; + tx = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 49 * 
NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 47 
* NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 
57 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp 
+ 54 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = 
SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx 
= SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + 
tx = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); 
+ tx = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), 
gik); + tx = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * 
ldG), gik); + tx = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 
* ldG), gik); + tx = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 
9 * ldG), gik); + tx = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik 
+ 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + 
SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); } } } diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx index f43292cd..f34e192a 100644 --- 
a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx @@ -756,162 +756,2198 @@ void integral_3_2(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 2; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 2 - c0; - int n = c0 - c1; - int p = c1; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; - int idxB = (((2 - m) * (2 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 5 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 60) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); - - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 
1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = 
SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 
* ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = 
SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); 
+ t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = 
SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, 
tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 
* ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = 
SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); 
+ ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = 
SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 
3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw 
= SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + 
SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * 
X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = 
SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, 
tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = 
SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + 
t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = 
SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = 
SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, 
tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 
* ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = 
SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = 
SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, 
tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 
* ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * 
NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); 
+ tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); } } @@ -2323,162 +4359,2198 @@ void integral_3_2(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 2; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 2 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((2 - m) * (2 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 5 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 60) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + 
p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); 
- SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk 
+ idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); 
+ tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = 
SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); 
+ tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 
SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); 
+ tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik 
+ 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + 
p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = 
SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); 
+ tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = 
SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, 
const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = 
SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = 
SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + 
p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = 
SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = 
SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, 
tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 
* ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = 
SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = 
SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, 
tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 
* ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * 
NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); 
+ tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); } for(; p_inner < npts_inner; p_inner += SCALAR_LENGTH) { @@ -2489,162 +6561,2198 @@ void integral_3_2(size_t npts, SCALAR_TYPE const_value_v = SCALAR_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 2; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 2 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((2 - m) * (2 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SCALAR_TYPE const_value_w; + SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 5 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 60) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SCALAR_TYPE tx, ty, tz, tw; - SCALAR_TYPE const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); - - mv = 3 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 0 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t0 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SCALAR_MUL(t0, const_value_w); - tz = SCALAR_FMA(ty, t0, tz); - tw = SCALAR_FMA(tx, t0, tw); - SCALAR_STORE((Gik + 0 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 1 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 1 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t1 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = 
SCALAR_MUL(t1, const_value_w); - tz = SCALAR_FMA(ty, t1, tz); - tw = SCALAR_FMA(tx, t1, tw); - SCALAR_STORE((Gik + 1 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 2 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 2 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t2 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SCALAR_MUL(t2, const_value_w); - tz = SCALAR_FMA(ty, t2, tz); - tw = SCALAR_FMA(tx, t2, tw); - SCALAR_STORE((Gik + 2 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 3 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 3 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t3 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SCALAR_MUL(t3, const_value_w); - tz = SCALAR_FMA(ty, t3, tz); - tw = SCALAR_FMA(tx, t3, tw); - SCALAR_STORE((Gik + 3 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 4 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 4 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t4 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SCALAR_MUL(t4, const_value_w); - tz = SCALAR_FMA(ty, t4, tz); - tw = SCALAR_FMA(tx, t4, tw); - SCALAR_STORE((Gik + 4 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 5 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 5 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t5 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SCALAR_MUL(t5, const_value_w); - tz = SCALAR_FMA(ty, t5, tz); - tw = 
SCALAR_FMA(tx, t5, tw); - SCALAR_STORE((Gik + 5 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 6 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 6 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t6 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SCALAR_MUL(t6, const_value_w); - tz = SCALAR_FMA(ty, t6, tz); - tw = SCALAR_FMA(tx, t6, tw); - SCALAR_STORE((Gik + 6 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 7 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 7 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t7 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SCALAR_MUL(t7, const_value_w); - tz = SCALAR_FMA(ty, t7, tz); - tw = SCALAR_FMA(tx, t7, tw); - SCALAR_STORE((Gik + 7 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 8 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 8 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t8 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SCALAR_MUL(t8, const_value_w); - tz = SCALAR_FMA(ty, t8, tz); - tw = SCALAR_FMA(tx, t8, tw); - SCALAR_STORE((Gik + 8 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SCALAR_LOAD((Xik + 9 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 9 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t9 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SCALAR_MUL(t9, const_value_w); - tz = SCALAR_FMA(ty, t9, tz); - tw = SCALAR_FMA(tx, t9, tw); - SCALAR_STORE((Gik + 9 * ldG), tz); - 
SCALAR_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = 
SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL 
+ p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * 
ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik 
+ 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = 
SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = 
SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + 
SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 
12 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = 
SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = 
SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL 
+ p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 
2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * 
ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + 
SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = 
SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + 
SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 
= SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik 
+ 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + 
p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, 
tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, 
const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk 
+ 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 
6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 
= SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 
* ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = 
SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = 
SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + 
SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + 
t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = 
SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, 
tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); } } } diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx index f78c490d..09222e56 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx @@ -1228,162 
+1228,5833 @@ void integral_3_3(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 3; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 3 - c0; - int n = c0 - c1; - int p = c1; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; - int idxB = (((3 - m) * (3 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 6 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 60) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); - - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 
1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = 
SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = 
SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 
* ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 
* ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz 
= SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 
1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, 
tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + 
SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = 
SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp 
+ 25 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 
* ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 
* ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + 
p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = 
SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); 
+ tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, 
const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = 
SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + 
t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = 
comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); 
+ tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = 
SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + 
SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp 
+ 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, 
const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), 
tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, 
tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = 
SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + 
t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * 
ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = 
SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 
3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw 
= SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + 
SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + 
const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 33 * 
NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); 
+ tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = 
SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = 
SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = 
SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); 
+ tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * 
ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 27 * 
NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx 
= SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, 
t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = 
SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = 
SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = 
SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, 
tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 
* ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = 
SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + 
SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + 
const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * 
NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = 
SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = 
SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, 
tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * 
ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + 
p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, 
tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp 
+ 33 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 
* ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx 
= SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, 
t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + 
SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp 
+ 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, 
const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + 
t1 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = 
SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); 
+ tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); 
+ t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); } } @@ -3739,162 +9410,5833 @@ void integral_3_3(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 3; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 3 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((3 - m) * (3 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 6 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 60) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 
1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = 
SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 
* ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = 
SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 
SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, 
t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = 
SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = 
SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp 
+ 28 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 
* ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = 
SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, 
tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp 
+ 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, 
const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + 
t0 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, 
tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); 
+ t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); 
+ tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); 
+ tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); 
+ t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + 
p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = 
SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j 
= 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = 
SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, 
const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = 
SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = 
SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, 
const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + 
t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); 
+ ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = 
SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); 
+ t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); 
+ tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w 
= SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + 
p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, 
tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = 
SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); 
+ tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik 
+ 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + 
p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = 
SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = 
SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, 
const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = 
SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = 
SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, 
const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + 
t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + 
SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, 
const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = 
SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, 
tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k 
* X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = 
SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); 
+ tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 
0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + 
p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + 
const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = 
SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); } for(; p_inner < npts_inner; p_inner += SCALAR_LENGTH) { @@ -3905,162 +15247,5833 @@ void integral_3_3(size_t npts, SCALAR_TYPE const_value_v = SCALAR_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 3; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 3 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((3 - m) * (3 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; + double 
const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SCALAR_TYPE const_value_w; + SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 6 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 60) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SCALAR_TYPE tx, ty, tz, tw; - SCALAR_TYPE const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); - - mv = 3 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 0 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t0 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SCALAR_MUL(t0, const_value_w); - tz = SCALAR_FMA(ty, t0, tz); - tw = SCALAR_FMA(tx, t0, tw); - SCALAR_STORE((Gik + 0 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 1 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 1 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t1 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SCALAR_MUL(t1, const_value_w); - tz = SCALAR_FMA(ty, t1, tz); - tw = SCALAR_FMA(tx, t1, tw); - SCALAR_STORE((Gik + 1 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 2 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 2 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t2 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SCALAR_MUL(t2, const_value_w); - tz = SCALAR_FMA(ty, t2, tz); - tw = SCALAR_FMA(tx, t2, tw); - SCALAR_STORE((Gik + 2 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; 
pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 3 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 3 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t3 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SCALAR_MUL(t3, const_value_w); - tz = SCALAR_FMA(ty, t3, tz); - tw = SCALAR_FMA(tx, t3, tw); - SCALAR_STORE((Gik + 3 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 4 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 4 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t4 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SCALAR_MUL(t4, const_value_w); - tz = SCALAR_FMA(ty, t4, tz); - tw = SCALAR_FMA(tx, t4, tw); - SCALAR_STORE((Gik + 4 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 5 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 5 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t5 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SCALAR_MUL(t5, const_value_w); - tz = SCALAR_FMA(ty, t5, tz); - tw = SCALAR_FMA(tx, t5, tw); - SCALAR_STORE((Gik + 5 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 6 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 6 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t6 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SCALAR_MUL(t6, const_value_w); - tz = SCALAR_FMA(ty, t6, tz); - tw = SCALAR_FMA(tx, t6, tw); - SCALAR_STORE((Gik + 6 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 7 * ldX)); - ty = 
SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 7 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t7 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SCALAR_MUL(t7, const_value_w); - tz = SCALAR_FMA(ty, t7, tz); - tw = SCALAR_FMA(tx, t7, tw); - SCALAR_STORE((Gik + 7 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 8 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 8 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t8 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SCALAR_MUL(t8, const_value_w); - tz = SCALAR_FMA(ty, t8, tz); - tw = SCALAR_FMA(tx, t8, tw); - SCALAR_STORE((Gik + 8 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SCALAR_LOAD((Xik + 9 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 9 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t9 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SCALAR_MUL(t9, const_value_w); - tz = SCALAR_FMA(ty, t9, tz); - tw = SCALAR_FMA(tx, t9, tw); - SCALAR_STORE((Gik + 9 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * 
X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; 
comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = 
SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = 
SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL 
+ p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, 
tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * 
ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = 
SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + 
tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 59 * 
NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, 
t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 
9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, 
const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), 
tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 
* ldG)); + t3 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = 
SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = 
SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + 
p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 
= SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, 
t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 
8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, 
tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); 
+ tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 15 * 
NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = 
SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * 
ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = 
SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, 
const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 
= SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 
* ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = 
SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = 
SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + 
SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 
13 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = 
SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + 
tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, 
const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = 
SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL 
+ p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 
= SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 
* ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = 
SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik 
+ 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = 
SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = 
SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, 
t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik 
+ 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + 
t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * 
ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * 
NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 
= SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = 
SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, 
SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * 
comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 
5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); 
+ const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), 
tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 
1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + 
SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 
1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = 
SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = 
SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = 
SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + 
SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 
* ldG)); + t4 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = 
SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = 
SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + 
p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 
= SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 
* ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = 
SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = 
SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + 
SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp 
+ 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = 
SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * 
ldG)); + t3 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = 
SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + 
tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t8 = 
SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + 
tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 22 * 
NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + 
SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + 
SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty 
= SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = 
SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = 
SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + 
SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + 
t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + 
SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp 
+ 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = 
SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); } } } diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx index 0acd4495..7bb8f02c 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx @@ -17,7 +17,7 @@ namespace XCPU { void integral_4(size_t npts, double *_points, point rA, - point /*rB*/, + point rB, int nprim_pairs, prim_pair *prim_pairs, double *Xi, @@ -2810,174 +2810,2038 @@ void integral_4(size_t npts, double *Xik = (Xi + p_outer + p_inner); double *Gik = (Gi + p_outer + p_inner); - for(int c0 = 0; c0 <= 4; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 4 - c0; - int p = c1; - - int idxB = (((4 - m) * (4 - m + 1)) >> 1) + p; - - int mv, pv; - - SIMD_TYPE tx, wg, xik, gik; - mv = 4 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 0 * 
ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); - mv = 3 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); - mv = 3 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); - mv = 2 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); - mv = 2 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); - mv = 2 + m; pv = 2 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik 
= SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); - mv = 1 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); - mv = 1 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); - mv = 1 + m; pv = 2 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); - mv = 1 + m; pv = 3 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); - mv = 0 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - 
SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); - mv = 0 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); - mv = 0 + m; pv = 2 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); - mv = 0 + m; pv = 3 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); - mv = 0 + m; pv = 4 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); - } - } + SIMD_TYPE tx, wg, xik, gik; + tx = SIMD_ALIGNED_LOAD((temp + 100 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = 
SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), 
gik); + tx = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg 
= SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik 
+ 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, 
gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 
6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, 
gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 
7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, 
gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 
8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, 
gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 136 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = 
SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + 
gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 
13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 144 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); } } // cleanup code for(; p_outer < npts; p_outer += NPTS_LOCAL) { - size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); + size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); double *_point_outer = (_points + p_outer); double xA = rA.x; @@ -8503,336 +10367,4064 @@ void integral_4(size_t npts, double *Xik = (Xi + p_outer + p_inner); double *Gik = (Gi + p_outer + p_inner); - for(int c0 = 0; c0 <= 4; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 4 - c0; - int p = c1; - - int idxB = (((4 - m) * (4 - m + 1)) >> 1) + p; - - int mv, pv; - - SIMD_TYPE tx, wg, xik, gik; - mv = 4 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); - mv = 3 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); - mv = 3 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); - mv = 2 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); - mv = 2 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); - mv = 2 + m; pv = 2 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); - mv = 1 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); - mv = 1 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - 
- tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); - mv = 1 + m; pv = 2 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); - mv = 1 + m; pv = 3 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); - mv = 0 + m; pv = 0 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); - mv = 0 + m; pv = 1 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); - mv = 0 + m; pv = 2 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = 
SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); - mv = 0 + m; pv = 3 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); - mv = 0 + m; pv = 4 + p; - tx = SIMD_ALIGNED_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - - xik = SIMD_UNALIGNED_LOAD((Xik + idxB * ldX)); - gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); - - tx = SIMD_MUL(tx, wg); - gik = SIMD_FMA(tx, xik, gik); - SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); - } - } + SIMD_TYPE tx, wg, xik, gik; + tx = SIMD_ALIGNED_LOAD((temp + 100 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 
9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, 
gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 
10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, 
gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik 
+ 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, 
gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = 
SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + 
gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), 
gik); + tx = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + 
p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + 
xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + 
gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), 
gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + 
p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + 
p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 136 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); 
+ gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = 
SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + 
SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = 
SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + 
xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = 
SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, 
wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + 
SIMD_UNALIGNED_STORE((Gik + 3 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), gik); + tx = 
SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), gik); + tx = SIMD_ALIGNED_LOAD((temp + 144 * NPTS_LOCAL + p_inner)); + wg = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); + + xik = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + gik = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + + tx = SIMD_MUL(tx, wg); + gik = SIMD_FMA(tx, xik, gik); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), gik); } for(; p_inner < npts_inner; p_inner += SCALAR_LENGTH) { double *Xik = (Xi + p_outer + p_inner); double *Gik = (Gi + p_outer + p_inner); - for(int c0 = 0; c0 <= 4; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 4 - c0; - int p = c1; - - int idxB = (((4 - m) * (4 - m + 1)) >> 1) + p; - - int mv, pv; - - SCALAR_TYPE tx, wg, xik, gik; - mv = 4 + m; pv = 0 + p; - tx = 
SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 0 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 0 * ldG), gik); - mv = 3 + m; pv = 0 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 1 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 1 * ldG), gik); - mv = 3 + m; pv = 1 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 2 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 2 * ldG), gik); - mv = 2 + m; pv = 0 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 3 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 3 * ldG), gik); - mv = 2 + m; pv = 1 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 4 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 4 * ldG), gik); - mv = 2 + m; pv = 2 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = 
SCALAR_LOAD((Gik + 5 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 5 * ldG), gik); - mv = 1 + m; pv = 0 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 6 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 6 * ldG), gik); - mv = 1 + m; pv = 1 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 7 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 7 * ldG), gik); - mv = 1 + m; pv = 2 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 8 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 8 * ldG), gik); - mv = 1 + m; pv = 3 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 9 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 9 * ldG), gik); - mv = 0 + m; pv = 0 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 10 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 10 * ldG), gik); - mv = 0 + m; pv = 1 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 
- mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 11 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 11 * ldG), gik); - mv = 0 + m; pv = 2 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 12 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 12 * ldG), gik); - mv = 0 + m; pv = 3 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 13 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 13 * ldG), gik); - mv = 0 + m; pv = 4 + p; - tx = SCALAR_LOAD((temp + (100 + (((8 - mv) * (8 - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - wg = SCALAR_LOAD((weights + p_outer + p_inner)); - - xik = SCALAR_LOAD((Xik + idxB * ldX)); - gik = SCALAR_LOAD((Gik + 14 * ldG)); - - tx = SCALAR_MUL(tx, wg); - gik = SCALAR_FMA(tx, xik, gik); - SCALAR_STORE((Gik + 14 * ldG), gik); - } - } + SCALAR_TYPE tx, wg, xik, gik; + tx = SCALAR_LOAD((temp + 100 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = 
SCALAR_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), 
gik); + tx = SCALAR_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 0 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + 
SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, 
xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 1 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + 
gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = 
SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * 
ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 2 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = 
SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * 
ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 3 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = 
SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + 
p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 4 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = 
SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + 
p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 5 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 
111 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = 
SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 6 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * 
ldG), gik); + tx = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + 
SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 7 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = 
SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, 
wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx 
= SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 8 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * 
ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = 
SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 9 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 
10 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 136 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = 
SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 10 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + 
p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + wg = 
SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 11 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 118 * 
NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = 
SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 12 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + 
SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 6 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = 
SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 13 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = 
SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); + tx = SCALAR_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 0 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 0 * ldG), gik); + tx = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 1 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 1 * ldG), gik); + tx = SCALAR_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 2 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 2 * ldG), gik); + tx = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 3 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 3 * ldG), gik); + tx = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 4 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 4 * ldG), gik); + tx = SCALAR_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 5 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 5 * ldG), gik); + tx = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 6 
* ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 6 * ldG), gik); + tx = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 7 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 7 * ldG), gik); + tx = SCALAR_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 8 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 8 * ldG), gik); + tx = SCALAR_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 9 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 9 * ldG), gik); + tx = SCALAR_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 10 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 10 * ldG), gik); + tx = SCALAR_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 11 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 11 * ldG), gik); + tx = SCALAR_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 12 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 12 * ldG), gik); + tx = SCALAR_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik 
= SCALAR_LOAD((Gik + 13 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 13 * ldG), gik); + tx = SCALAR_LOAD((temp + 144 * NPTS_LOCAL + p_inner)); + wg = SCALAR_LOAD((weights + p_outer + p_inner)); + + xik = SCALAR_LOAD((Xik + 14 * ldX)); + gik = SCALAR_LOAD((Gik + 14 * ldG)); + + tx = SCALAR_MUL(tx, wg); + gik = SCALAR_FMA(tx, xik, gik); + SCALAR_STORE((Gik + 14 * ldG), gik); } } } diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.cxx index 8945bc3c..42b1e7b4 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.cxx @@ -726,222 +726,942 @@ void integral_4_1(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 1; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 1 - c0; - int n = c0 - c1; - int p = c1; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - int idxB = (((1 - m) * (1 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 5 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); - - mv = 4 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * 
ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = 
SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t10 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SIMD_MUL(t10, const_value_w); - tz = SIMD_FMA(ty, t10, tz); - tw = SIMD_FMA(tx, t10, tw); - SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t11 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * 
NPTS_LOCAL + p_inner)); - t11 = SIMD_MUL(t11, const_value_w); - tz = SIMD_FMA(ty, t11, tz); - tw = SIMD_FMA(tx, t11, tw); - SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t12 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SIMD_MUL(t12, const_value_w); - tz = SIMD_FMA(ty, t12, tz); - tw = SIMD_FMA(tx, t12, tw); - SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t13 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SIMD_MUL(t13, const_value_w); - tz = SIMD_FMA(ty, t13, tz); - tw = SIMD_FMA(tx, t13, tw); - SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t14 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SIMD_MUL(t14, const_value_w); - tz = SIMD_FMA(ty, t14, tz); - tw = SIMD_FMA(tx, t14, tw); - SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = 
SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + 
tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = 
SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik 
+ 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 
* ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); 
+ tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * 
ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, 
t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * 
Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * 
NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); } } for(; p_outer < npts; p_outer += NPTS_LOCAL) { - size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); + size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); double *_point_outer = (_points + p_outer); double X_AB = rA.x - rB.x; @@ -2288,217 +3008,937 @@ void integral_4_1(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 1; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 1 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((1 - m) * (1 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 5 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - mv = 4 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * 
ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = 
SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t10 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SIMD_MUL(t10, const_value_w); - tz = SIMD_FMA(ty, t10, tz); - tw = SIMD_FMA(tx, t10, tw); - SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t11 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * 
NPTS_LOCAL + p_inner)); - t11 = SIMD_MUL(t11, const_value_w); - tz = SIMD_FMA(ty, t11, tz); - tw = SIMD_FMA(tx, t11, tw); - SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t12 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SIMD_MUL(t12, const_value_w); - tz = SIMD_FMA(ty, t12, tz); - tw = SIMD_FMA(tx, t12, tw); - SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t13 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SIMD_MUL(t13, const_value_w); - tz = SIMD_FMA(ty, t13, tz); - tw = SIMD_FMA(tx, t13, tw); - SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t14 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SIMD_MUL(t14, const_value_w); - tz = SIMD_FMA(ty, t14, tz); - tw = SIMD_FMA(tx, t14, tw); - SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = 
SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + 
tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = 
SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik 
+ 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 
* ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); 
+ tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * 
ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, 
t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * 
Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * 
NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); } for(; p_inner < npts_inner; p_inner += SCALAR_LENGTH) { @@ -2509,217 +3949,937 @@ void integral_4_1(size_t npts, SCALAR_TYPE const_value_v = SCALAR_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 1; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 1 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((1 - m) * (1 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SCALAR_TYPE const_value_w; + SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 5 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SCALAR_TYPE tx, ty, tz, tw; - SCALAR_TYPE const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); - - mv = 4 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 0 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t0 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SCALAR_MUL(t0, const_value_w); - tz = SCALAR_FMA(ty, 
t0, tz); - tw = SCALAR_FMA(tx, t0, tw); - SCALAR_STORE((Gik + 0 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 1 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 1 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t1 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SCALAR_MUL(t1, const_value_w); - tz = SCALAR_FMA(ty, t1, tz); - tw = SCALAR_FMA(tx, t1, tw); - SCALAR_STORE((Gik + 1 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 2 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 2 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t2 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SCALAR_MUL(t2, const_value_w); - tz = SCALAR_FMA(ty, t2, tz); - tw = SCALAR_FMA(tx, t2, tw); - SCALAR_STORE((Gik + 2 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 3 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 3 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t3 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SCALAR_MUL(t3, const_value_w); - tz = SCALAR_FMA(ty, t3, tz); - tw = SCALAR_FMA(tx, t3, tw); - SCALAR_STORE((Gik + 3 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 4 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 4 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t4 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SCALAR_MUL(t4, const_value_w); - tz = SCALAR_FMA(ty, t4, tz); - tw = SCALAR_FMA(tx, t4, tw); - SCALAR_STORE((Gik + 4 * ldG), 
tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 5 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 5 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t5 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SCALAR_MUL(t5, const_value_w); - tz = SCALAR_FMA(ty, t5, tz); - tw = SCALAR_FMA(tx, t5, tw); - SCALAR_STORE((Gik + 5 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 6 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 6 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t6 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SCALAR_MUL(t6, const_value_w); - tz = SCALAR_FMA(ty, t6, tz); - tw = SCALAR_FMA(tx, t6, tw); - SCALAR_STORE((Gik + 6 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 7 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 7 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t7 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SCALAR_MUL(t7, const_value_w); - tz = SCALAR_FMA(ty, t7, tz); - tw = SCALAR_FMA(tx, t7, tw); - SCALAR_STORE((Gik + 7 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 8 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 8 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t8 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SCALAR_MUL(t8, const_value_w); - tz = SCALAR_FMA(ty, t8, tz); - tw = SCALAR_FMA(tx, t8, tw); - SCALAR_STORE((Gik + 8 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - 
k; - tx = SCALAR_LOAD((Xik + 9 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 9 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t9 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SCALAR_MUL(t9, const_value_w); - tz = SCALAR_FMA(ty, t9, tz); - tw = SCALAR_FMA(tx, t9, tw); - SCALAR_STORE((Gik + 9 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 10 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 10 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t10 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SCALAR_MUL(t10, const_value_w); - tz = SCALAR_FMA(ty, t10, tz); - tw = SCALAR_FMA(tx, t10, tw); - SCALAR_STORE((Gik + 10 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 11 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 11 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t11 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SCALAR_MUL(t11, const_value_w); - tz = SCALAR_FMA(ty, t11, tz); - tw = SCALAR_FMA(tx, t11, tw); - SCALAR_STORE((Gik + 11 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 12 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 12 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t12 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SCALAR_MUL(t12, const_value_w); - tz = SCALAR_FMA(ty, t12, tz); - tw = SCALAR_FMA(tx, t12, tw); - SCALAR_STORE((Gik + 12 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SCALAR_LOAD((Xik + 13 * ldX)); - ty = 
SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 13 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t13 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SCALAR_MUL(t13, const_value_w); - tz = SCALAR_FMA(ty, t13, tz); - tw = SCALAR_FMA(tx, t13, tw); - SCALAR_STORE((Gik + 13 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SCALAR_LOAD((Xik + 14 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 14 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t14 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SCALAR_MUL(t14, const_value_w); - tz = SCALAR_FMA(ty, t14, tz); - tw = SCALAR_FMA(tx, t14, tw); - SCALAR_STORE((Gik + 14 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 
* ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = 
SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * 
NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * 
ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * 
ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = 
SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * 
ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik 
+ 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + 
p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), 
tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * 
ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + 
t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty 
= SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + 
t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz 
= SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = 
SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + 
p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = 
SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); } } } diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.cxx index cba4ee4c..08d8d177 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.cxx @@ -13,7 +13,6 @@ #define PI 3.14159265358979323846 - namespace XCPU { void integral_4_2(size_t npts, double *_points, @@ -1199,222 +1198,3253 @@ void integral_4_2(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 2; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 2 - c0; - int n = c0 - c1; - int p = c1; - - 
int idxB = (((2 - m) * (2 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 6 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); - - mv = 4 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = 
SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = 
SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); 
- tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t10 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SIMD_MUL(t10, const_value_w); - tz = SIMD_FMA(ty, t10, tz); - tw = SIMD_FMA(tx, t10, tw); - SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t11 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SIMD_MUL(t11, const_value_w); - tz = SIMD_FMA(ty, t11, tz); - tw = SIMD_FMA(tx, t11, tw); - SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t12 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SIMD_MUL(t12, const_value_w); - tz = SIMD_FMA(ty, t12, tz); - tw = SIMD_FMA(tx, t12, tw); - SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t13 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SIMD_MUL(t13, const_value_w); - tz = SIMD_FMA(ty, t13, tz); - tw = SIMD_FMA(tx, t13, tw); - SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB 
* ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t14 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SIMD_MUL(t14, const_value_w); - tz = SIMD_FMA(ty, t14, tz); - tw = SIMD_FMA(tx, t14, tw); - SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = 
SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, 
SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + 
t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = 
SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = 
SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + 
t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty 
= SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + 
tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, 
tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik 
+ 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, 
tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); 
+ tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = 
SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = 
SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + 
SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = 
SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * 
NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * 
ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + 
const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * 
NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp 
+ 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); 
+ t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, 
SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 
* ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, 
const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, 
const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = 
SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, 
tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + 
t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, 
tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp 
+ 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); } } for(; p_outer < npts; p_outer += NPTS_LOCAL) { - size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); + size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); double *_point_outer = (_points + p_outer); double X_AB = rA.x - rB.x; @@ -3705,217 +6735,3248 @@ void integral_4_2(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 2; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 2 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((2 - m) * (2 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 6 - i - j - k; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); - - mv = 4 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * 
ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - 
SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, 
const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t10 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SIMD_MUL(t10, const_value_w); - tz = SIMD_FMA(ty, t10, tz); - tw = SIMD_FMA(tx, t10, tw); - SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t11 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SIMD_MUL(t11, const_value_w); - tz = SIMD_FMA(ty, t11, tz); - tw = SIMD_FMA(tx, t11, tw); - SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE 
t12 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SIMD_MUL(t12, const_value_w); - tz = SIMD_FMA(ty, t12, tz); - tw = SIMD_FMA(tx, t12, tw); - SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t13 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SIMD_MUL(t13, const_value_w); - tz = SIMD_FMA(ty, t13, tz); - tw = SIMD_FMA(tx, t13, tw); - SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t14 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SIMD_MUL(t14, const_value_w); - tz = SIMD_FMA(ty, t14, tz); - tw = SIMD_FMA(tx, t14, tw); - SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * 
comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = 
SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = 
SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = 
SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = 
SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik 
+ 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx 
= SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, 
t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = 
SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = 
SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * 
NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + 
p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * 
ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk 
+ 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); 
+ tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, 
tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 
3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, 
const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, 
t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 
* ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 
* ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = 
SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, 
tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, 
t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * 
comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = 
SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = 
SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, 
tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = 
SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = 
SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + 
t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * 
NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik 
+ 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = 
SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = 
SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz 
= SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); } for(; p_inner < npts_inner; p_inner += SCALAR_LENGTH) { @@ -3926,217 +9987,3248 @@ void integral_4_2(size_t npts, SCALAR_TYPE const_value_v = SCALAR_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 2; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 2 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((2 - m) * (2 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SCALAR_TYPE const_value_w; + SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 6 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SCALAR_TYPE tx, ty, tz, tw; - SCALAR_TYPE const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); - - mv = 4 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz 
= SCALAR_LOAD((Gik + 0 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t0 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SCALAR_MUL(t0, const_value_w); - tz = SCALAR_FMA(ty, t0, tz); - tw = SCALAR_FMA(tx, t0, tw); - SCALAR_STORE((Gik + 0 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 1 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 1 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t1 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SCALAR_MUL(t1, const_value_w); - tz = SCALAR_FMA(ty, t1, tz); - tw = SCALAR_FMA(tx, t1, tw); - SCALAR_STORE((Gik + 1 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 2 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 2 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t2 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SCALAR_MUL(t2, const_value_w); - tz = SCALAR_FMA(ty, t2, tz); - tw = SCALAR_FMA(tx, t2, tw); - SCALAR_STORE((Gik + 2 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 3 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 3 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t3 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SCALAR_MUL(t3, const_value_w); - tz = SCALAR_FMA(ty, t3, tz); - tw = SCALAR_FMA(tx, t3, tw); - SCALAR_STORE((Gik + 3 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 4 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 4 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - 
SCALAR_TYPE t4 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SCALAR_MUL(t4, const_value_w); - tz = SCALAR_FMA(ty, t4, tz); - tw = SCALAR_FMA(tx, t4, tw); - SCALAR_STORE((Gik + 4 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 5 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 5 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t5 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SCALAR_MUL(t5, const_value_w); - tz = SCALAR_FMA(ty, t5, tz); - tw = SCALAR_FMA(tx, t5, tw); - SCALAR_STORE((Gik + 5 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 6 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 6 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t6 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SCALAR_MUL(t6, const_value_w); - tz = SCALAR_FMA(ty, t6, tz); - tw = SCALAR_FMA(tx, t6, tw); - SCALAR_STORE((Gik + 6 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 7 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 7 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t7 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SCALAR_MUL(t7, const_value_w); - tz = SCALAR_FMA(ty, t7, tz); - tw = SCALAR_FMA(tx, t7, tw); - SCALAR_STORE((Gik + 7 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 8 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 8 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t8 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) 
>> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SCALAR_MUL(t8, const_value_w); - tz = SCALAR_FMA(ty, t8, tz); - tw = SCALAR_FMA(tx, t8, tw); - SCALAR_STORE((Gik + 8 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SCALAR_LOAD((Xik + 9 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 9 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t9 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SCALAR_MUL(t9, const_value_w); - tz = SCALAR_FMA(ty, t9, tz); - tw = SCALAR_FMA(tx, t9, tw); - SCALAR_STORE((Gik + 9 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 10 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 10 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t10 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SCALAR_MUL(t10, const_value_w); - tz = SCALAR_FMA(ty, t10, tz); - tw = SCALAR_FMA(tx, t10, tw); - SCALAR_STORE((Gik + 10 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 11 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 11 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t11 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SCALAR_MUL(t11, const_value_w); - tz = SCALAR_FMA(ty, t11, tz); - tw = SCALAR_FMA(tx, t11, tw); - SCALAR_STORE((Gik + 11 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 12 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 12 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t12 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = 
SCALAR_MUL(t12, const_value_w); - tz = SCALAR_FMA(ty, t12, tz); - tw = SCALAR_FMA(tx, t12, tw); - SCALAR_STORE((Gik + 12 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SCALAR_LOAD((Xik + 13 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 13 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t13 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SCALAR_MUL(t13, const_value_w); - tz = SCALAR_FMA(ty, t13, tz); - tw = SCALAR_FMA(tx, t13, tw); - SCALAR_STORE((Gik + 13 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SCALAR_LOAD((Xik + 14 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 14 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t14 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SCALAR_MUL(t14, const_value_w); - tz = SCALAR_FMA(ty, t14, tz); - tw = SCALAR_FMA(tx, t14, tw); - SCALAR_STORE((Gik + 14 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + 
tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = 
SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + 
SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + 
tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * 
NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = 
SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = 
SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, 
t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = 
SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL 
+ p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, 
t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = 
SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL 
+ p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, 
t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 
1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp 
+ 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + 
tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * 
NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = 
SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); 
+ tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = 
SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + 
SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 
2 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz 
= SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + 
SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + 
tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = 
SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + 
SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 
2 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = 
SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + 
SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 
39 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = 
SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + 
tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * 
NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = 
SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + 
tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * 
NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, 
t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = 
SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 
* ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, 
const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), 
tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = 
SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 
* ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, 
const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); 
+ SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * 
ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = 
SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * 
ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + 
SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); 
+ ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 
5 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + 
SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz 
= SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * 
NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); } } } diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.cxx index c3b0d68e..e4541197 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.cxx @@ -1878,222 +1878,8638 @@ void integral_4_3(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 3; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 3 - c0; - int n = c0 - c1; - int p = c1; + double 
const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - int idxB = (((3 - m) * (3 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 7 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); - - mv = 4 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * 
ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = 
SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t10 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SIMD_MUL(t10, const_value_w); - tz = SIMD_FMA(ty, t10, tz); - tw = SIMD_FMA(tx, t10, tw); - SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t11 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SIMD_MUL(t11, const_value_w); - tz = SIMD_FMA(ty, t11, tz); - tw = SIMD_FMA(tx, t11, tw); - SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t12 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SIMD_MUL(t12, const_value_w); - tz = SIMD_FMA(ty, t12, tz); - tw = SIMD_FMA(tx, t12, tw); - SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t13 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 
1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SIMD_MUL(t13, const_value_w); - tz = SIMD_FMA(ty, t13, tz); - tw = SIMD_FMA(tx, t13, tw); - SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t14 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SIMD_MUL(t14, const_value_w); - tz = SIMD_FMA(ty, t14, tz); - tw = SIMD_FMA(tx, t14, tw); - SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + 
SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = 
SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 
SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, 
t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); 
+ t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = 
SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * 
ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 70 * 
NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + 
SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + 
SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = 
SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, 
tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = 
SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, 
SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * 
NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = 
SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = 
SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, 
t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik 
+ 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, 
tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = 
SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, 
t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik 
+ 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + 
p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + 
t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 
* ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + 
p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = 
SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + 
SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * 
X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = 
SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = 
SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, 
tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + 
SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = 
SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = 
SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + 
t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = 
SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, 
tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + 
tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, 
t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk 
+ 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); 
+ ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + 
t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * 
ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * 
NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 
12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = 
SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = 
SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, 
const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + 
p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, 
const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); 
+ tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * 
ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, 
t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * 
comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = 
SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = 
SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, 
t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + 
SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = 
SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + X_ABp = 1.0; 
comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + 
t13 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * 
NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik 
+ 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + 
tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + 
SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik 
+ 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, 
tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 
0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * 
NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, 
const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = 
SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk 
+ 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 
* ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, 
tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk 
+ 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 
= SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = 
SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + 
SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = 
SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp 
+ 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz 
= SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + 
p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, 
const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); } } for(; p_outer < npts; p_outer += NPTS_LOCAL) { - size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); + size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); double *_point_outer = (_points + p_outer); double X_AB = rA.x - rB.x; @@ -5744,217 +14160,8633 @@ void integral_4_3(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 3; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 3 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((3 - m) * (3 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 7 - i - j - k; - - int offset = 
(Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - mv = 4 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), 
tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = 
SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t10 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 
1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SIMD_MUL(t10, const_value_w); - tz = SIMD_FMA(ty, t10, tz); - tw = SIMD_FMA(tx, t10, tw); - SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t11 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SIMD_MUL(t11, const_value_w); - tz = SIMD_FMA(ty, t11, tz); - tw = SIMD_FMA(tx, t11, tw); - SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t12 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SIMD_MUL(t12, const_value_w); - tz = SIMD_FMA(ty, t12, tz); - tw = SIMD_FMA(tx, t12, tw); - SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t13 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SIMD_MUL(t13, const_value_w); - tz = SIMD_FMA(ty, t13, tz); - tw = SIMD_FMA(tx, t13, tw); - SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik 
+ 14 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t14 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SIMD_MUL(t14, const_value_w); - tz = SIMD_FMA(ty, t14, tz); - tw = SIMD_FMA(tx, t14, tw); - SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, 
tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, 
tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 
3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, 
const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + 
p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, 
const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = 
SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + 
SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = 
SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = 
SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 
4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, 
t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 
* ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = 
SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz 
= SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, 
const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = 
SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + 
t8 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); 
+ ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, 
tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 
= SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = 
SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = 
SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); 
+ tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); 
+ t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = 
SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + 
SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * 
X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = 
SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = 
SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = 
SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = 
SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 
3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = 
SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = 
SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + 
tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, 
tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik 
+ 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, 
tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 
= SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = 
SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = 
SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + 
t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp 
+ 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz 
= SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + 
p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, 
const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = 
SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, 
t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * 
Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * 
NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = 
SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = 
SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, 
SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz 
= SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * 
NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + 
p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = 
SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = 
SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); 
+ t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, 
const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = 
SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, 
const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = 
SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk 
+ 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 
* ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, 
tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk 
+ 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 
* NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + 
SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + 
t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); 
+ tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = 
SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, 
const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, 
t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 
* ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 
* ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = 
SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * 
NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * 
ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, 
const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); } for(; p_inner < npts_inner; p_inner += SCALAR_LENGTH) { @@ -5965,217 +22797,8633 @@ void integral_4_3(size_t npts, SCALAR_TYPE const_value_v = SCALAR_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 3; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 3 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((3 - m) * (3 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SCALAR_TYPE const_value_w; + SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 7 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SCALAR_TYPE tx, ty, tz, tw; - SCALAR_TYPE const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); - - mv = 4 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 0 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t0 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SCALAR_MUL(t0, const_value_w); - tz = SCALAR_FMA(ty, t0, tz); - tw = SCALAR_FMA(tx, t0, tw); - 
SCALAR_STORE((Gik + 0 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 1 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 1 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t1 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SCALAR_MUL(t1, const_value_w); - tz = SCALAR_FMA(ty, t1, tz); - tw = SCALAR_FMA(tx, t1, tw); - SCALAR_STORE((Gik + 1 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 2 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 2 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t2 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SCALAR_MUL(t2, const_value_w); - tz = SCALAR_FMA(ty, t2, tz); - tw = SCALAR_FMA(tx, t2, tw); - SCALAR_STORE((Gik + 2 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 3 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 3 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t3 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SCALAR_MUL(t3, const_value_w); - tz = SCALAR_FMA(ty, t3, tz); - tw = SCALAR_FMA(tx, t3, tw); - SCALAR_STORE((Gik + 3 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 4 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 4 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t4 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SCALAR_MUL(t4, const_value_w); - tz = SCALAR_FMA(ty, t4, tz); - tw = SCALAR_FMA(tx, t4, tw); - SCALAR_STORE((Gik + 4 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); 
- mv = 2 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 5 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 5 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t5 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SCALAR_MUL(t5, const_value_w); - tz = SCALAR_FMA(ty, t5, tz); - tw = SCALAR_FMA(tx, t5, tw); - SCALAR_STORE((Gik + 5 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 6 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 6 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t6 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SCALAR_MUL(t6, const_value_w); - tz = SCALAR_FMA(ty, t6, tz); - tw = SCALAR_FMA(tx, t6, tw); - SCALAR_STORE((Gik + 6 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 7 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 7 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t7 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SCALAR_MUL(t7, const_value_w); - tz = SCALAR_FMA(ty, t7, tz); - tw = SCALAR_FMA(tx, t7, tw); - SCALAR_STORE((Gik + 7 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 8 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 8 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t8 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SCALAR_MUL(t8, const_value_w); - tz = SCALAR_FMA(ty, t8, tz); - tw = SCALAR_FMA(tx, t8, tw); - SCALAR_STORE((Gik + 8 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SCALAR_LOAD((Xik + 9 * ldX)); - ty 
= SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 9 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t9 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SCALAR_MUL(t9, const_value_w); - tz = SCALAR_FMA(ty, t9, tz); - tw = SCALAR_FMA(tx, t9, tw); - SCALAR_STORE((Gik + 9 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 10 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 10 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t10 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SCALAR_MUL(t10, const_value_w); - tz = SCALAR_FMA(ty, t10, tz); - tw = SCALAR_FMA(tx, t10, tw); - SCALAR_STORE((Gik + 10 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 11 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 11 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t11 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SCALAR_MUL(t11, const_value_w); - tz = SCALAR_FMA(ty, t11, tz); - tw = SCALAR_FMA(tx, t11, tw); - SCALAR_STORE((Gik + 11 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 12 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 12 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t12 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SCALAR_MUL(t12, const_value_w); - tz = SCALAR_FMA(ty, t12, tz); - tw = SCALAR_FMA(tx, t12, tw); - SCALAR_STORE((Gik + 12 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SCALAR_LOAD((Xik + 13 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = 
SCALAR_LOAD((Gik + 13 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t13 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SCALAR_MUL(t13, const_value_w); - tz = SCALAR_FMA(ty, t13, tz); - tw = SCALAR_FMA(tx, t13, tw); - SCALAR_STORE((Gik + 13 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SCALAR_LOAD((Xik + 14 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 14 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t14 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SCALAR_MUL(t14, const_value_w); - tz = SCALAR_FMA(ty, t14, tz); - tw = SCALAR_FMA(tx, t14, tw); - SCALAR_STORE((Gik + 14 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 
* ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 
0 * ldG)); + t6 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, 
const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + 
tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); 
+ tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 
= SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk 
+ 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = 
SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * 
NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * 
ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * 
ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = 
SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * 
ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik 
+ 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + 
p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), 
tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * 
ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + 
p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + 
SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp 
+ 52 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + 
SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp 
+ 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik 
+ 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * 
ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik 
+ 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk 
+ 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = 
SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * 
ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik 
+ 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + 
p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 
* ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 
2 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, 
const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + 
tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); 
+ tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 
= SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 
= SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = 
SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik 
+ 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = 
SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = 
SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = 
SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = 
SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL 
+ p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = 
SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = 
SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL 
+ p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = 
SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = 
SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL 
+ p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = 
SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 
= SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = 
SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 
= SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = 
SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = 
SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = 
SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = 
SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL 
+ p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = 
SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, 
const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + 
SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + 
tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = 
SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + 
SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * 
ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = 
SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + 
SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, 
const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + 
SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * 
ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = 
SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + 
SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 
= SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = 
SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + 
SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * 
NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, 
t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, 
const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), 
tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, 
t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), 
tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = 
SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, 
tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * 
ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + 
tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 
12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + 
SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); 
+ ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); 
+ SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + 
tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = 
SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); 
+ tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * 
ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + 
p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx 
= SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + 
tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); 
+ t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + 
t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = 
SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 
= SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); 
+ tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = 
SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL 
+ p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw 
= SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = 
SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL 
+ p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw 
= SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, 
const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + 
SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, 
const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + 
SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = 
SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, 
tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + 
SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + 
p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, 
tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, 
const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), 
tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty 
= SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, 
tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); 
+ tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = 
SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw 
= SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = 
SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + 
SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); 
+ ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + 
SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz 
= SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = 
SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 98 * NPTS_LOCAL + 
p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 
= SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, 
const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz 
= SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * 
NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = 
SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * 
ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = 
SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = 
SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); } } } diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.cxx index 15b895e1..6a702fbf 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.cxx @@ -2819,222 +2819,19398 @@ void integral_4_4(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 4; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 4 - c0; - int n = c0 - c1; - int p = c1; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - int idxB = (((4 - m) * (4 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 8 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); - - mv = 4 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = 
SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - 
SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, 
const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t10 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SIMD_MUL(t10, const_value_w); - tz = SIMD_FMA(ty, t10, tz); - tw = SIMD_FMA(tx, t10, tw); - SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t11 = 
SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SIMD_MUL(t11, const_value_w); - tz = SIMD_FMA(ty, t11, tz); - tw = SIMD_FMA(tx, t11, tw); - SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t12 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SIMD_MUL(t12, const_value_w); - tz = SIMD_FMA(ty, t12, tz); - tw = SIMD_FMA(tx, t12, tw); - SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t13 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SIMD_MUL(t13, const_value_w); - tz = SIMD_FMA(ty, t13, tz); - tw = SIMD_FMA(tx, t13, tw); - SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t14 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SIMD_MUL(t14, const_value_w); - tz = SIMD_FMA(ty, t14, tz); - tw = SIMD_FMA(tx, t14, tw); - SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = 
SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 100 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 
= SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 4, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, 
t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp 
= 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 
* ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, 
const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + 
t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(4)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp 
+ 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t8 
= SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); 
+ tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); 
+ t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = 
SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * 
ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = 
SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = 
SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = 
SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, 
const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 
* ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + 
p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = 
SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; 
comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t13 = 
SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = 
SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = 
SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = 
SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 
15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp 
+ 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = 
SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = 
SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + 
SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp 
+ 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk 
+ 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 
* ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + 
p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = 
SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + 
SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = 
comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, 
const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, 
const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = 
SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, 
t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik 
+ 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, 
tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + 
t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * 
ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 111 * 
NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + 
SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * 
ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + 
p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + 
t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 
* ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, 
const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + 
p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, 
const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = 
SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = 
SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 
1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * 
NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = 
SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + 
t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = 
SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz 
= SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, 
const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = 
SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, 
const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = 
SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + 
t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk 
+ 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); 
+ tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, 
tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 
= SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = 
SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = 
SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 110 * 
NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 
10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + 
const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 75 * 
NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + 
SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, 
tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = 
SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + X_ABp = 
SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, 
t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = 
SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + 
tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, 
const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = 
SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, 
t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, 
tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + 
tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = 
SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + 
t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, 
t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * 
comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = 
SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = 
SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, 
t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + 
SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = 
SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = 
SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 
4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, 
t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 
* ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 
2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 
11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t5 
= SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = 
SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); 
+ tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, 
const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = 
SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, 
t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk 
+ 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); 
+ ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, 
tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 
= SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = 
SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + 
SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = 
SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, 
tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = 
SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * 
NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t4 = 
SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = 
SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = 
SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i 
* comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * 
ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, 
tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = 
SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, 
tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); 
+ Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + 
t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * 
NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * 
ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + 
p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 
* ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 136 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = 
SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 4, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = 
SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
10 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); 
+ tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, 
const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 
10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t9 = 
SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(4)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = 
SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 
12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + t10 = 
SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k 
* 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = 
SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 
= SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, 
tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = 
SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = 
SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = 
SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + 
SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik 
+ 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * 
NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik 
+ 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = 
SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = 
SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 
12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + t10 = 
SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k 
* 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = 
SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 
* NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = 
SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, 
t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); 
+ t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + 
tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk 
+ 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, 
const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 
12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = 
SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = 
SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz 
= SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + 
const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = 
SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 
77 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = 
SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, 
tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 
= SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz 
= SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = 
SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); 
+ tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, 
const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + 
p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 
7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = 
SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, 
t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * 
Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t4 = 
SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = 
SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 144 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 4, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = 
SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
14 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = 
SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); 
+ SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, 
const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(4)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik 
+ 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk 
+ 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = 
SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); } } for(; p_outer < npts; p_outer += NPTS_LOCAL) { - size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); + size_t npts_inner = std::min((size_t) NPTS_LOCAL, npts - p_outer); double *_point_outer = (_points + p_outer); double X_AB = rA.x - rB.x; @@ -8567,217 +27743,19393 @@ void integral_4_4(size_t npts, SIMD_TYPE const_value_v = SIMD_UNALIGNED_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 4; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 4 - c0; - int n = c0 - c1; 
- int p = c1; - - int idxB = (((4 - m) * (4 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; - - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 8 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SIMD_TYPE tx, ty, tz, tw; - SIMD_TYPE const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SIMD_TYPE const_value_w; + SIMD_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - mv = 4 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t0 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SIMD_MUL(t0, const_value_w); - tz = SIMD_FMA(ty, t0, tz); - tw = SIMD_FMA(tx, t0, tw); - SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t1 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SIMD_MUL(t1, const_value_w); - tz = SIMD_FMA(ty, t1, tz); - tw = SIMD_FMA(tx, t1, tw); - SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t2 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SIMD_MUL(t2, const_value_w); - tz = SIMD_FMA(ty, t2, tz); - tw = SIMD_FMA(tx, t2, tw); - SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t3 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SIMD_MUL(t3, const_value_w); - tz = SIMD_FMA(ty, t3, tz); - tw = SIMD_FMA(tx, t3, tw); - SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t4 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t4 = SIMD_MUL(t4, const_value_w); - tz = SIMD_FMA(ty, t4, tz); - tw = SIMD_FMA(tx, t4, tw); - SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t5 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SIMD_MUL(t5, const_value_w); - tz = SIMD_FMA(ty, t5, tz); - tw = SIMD_FMA(tx, t5, tw); - SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 
+ p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t6 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SIMD_MUL(t6, const_value_w); - tz = SIMD_FMA(ty, t6, tz); - tw = SIMD_FMA(tx, t6, tw); - SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t7 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SIMD_MUL(t7, const_value_w); - tz = SIMD_FMA(ty, t7, tz); - tw = SIMD_FMA(tx, t7, tw); - SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t8 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SIMD_MUL(t8, const_value_w); - tz = SIMD_FMA(ty, t8, tz); - tw = SIMD_FMA(tx, t8, tw); - SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t9 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SIMD_MUL(t9, const_value_w); - tz = SIMD_FMA(ty, t9, tz); - tw = SIMD_FMA(tx, t9, tw); - 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t10 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SIMD_MUL(t10, const_value_w); - tz = SIMD_FMA(ty, t10, tz); - tw = SIMD_FMA(tx, t10, tw); - SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t11 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SIMD_MUL(t11, const_value_w); - tz = SIMD_FMA(ty, t11, tz); - tw = SIMD_FMA(tx, t11, tw); - SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t12 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SIMD_MUL(t12, const_value_w); - tz = SIMD_FMA(ty, t12, tz); - tw = SIMD_FMA(tx, t12, tw); - SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t13 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 
1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SIMD_MUL(t13, const_value_w); - tz = SIMD_FMA(ty, t13, tz); - tw = SIMD_FMA(tx, t13, tw); - SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); - ty = SIMD_UNALIGNED_LOAD((Xjk + idxB * ldX)); - tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); - tw = SIMD_UNALIGNED_LOAD((Gjk + idxB * ldG)); - SIMD_TYPE t14 = SIMD_ALIGNED_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SIMD_MUL(t14, const_value_w); - tz = SIMD_FMA(ty, t14, tz); - tw = SIMD_FMA(tx, t14, tw); - SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); - SIMD_UNALIGNED_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 100 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + 
SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = 
SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 
SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 4, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, 
t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); 
+ t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = 
SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * 
ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(4)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz 
= SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 0 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 0 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = 
SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); 
+ t6 = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 
* ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, 
tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 
* ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = 
SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + 
tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, 
t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp 
= 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 
* ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, 
const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, 
const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + 
SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + 
tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, 
tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 
13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 1 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 1 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = 
SIMD_ALIGNED_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * 
ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + 
tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = 
SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + 
t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); 
+ tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = 
SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, 
t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = 
SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, 
SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, 
const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 2 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 2 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, 
tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk 
+ 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, 
t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = 
SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + 
t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + 
SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * 
ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = 
SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = 
SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = 
SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, 
const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = 
SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 3 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 3 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, 
const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, 
SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 
* ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, 
const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, 
const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 
* ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz 
= SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + 
t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik 
+ 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, 
const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = 
SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * 
NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, 
const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + 
SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = 
SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + 
SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = 
SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, 
tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * 
ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + 
SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 4 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 4 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + 
const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 113 * 
NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = 
SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = 
SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 
1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 90 * 
NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = 
SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = 
SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + 
SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, 
t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik 
+ 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + 
SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 
0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 5 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 5 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * 
NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 5 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, 
const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = 
SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); 
+ tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = 
SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + 
p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = 
SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + 
SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * 
X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = 
SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = 
SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, 
t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = 
SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + 
SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t10 = 
SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 6 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 6 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 6 * ldG), tw); + X_ABp = 1.0; 
comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + 
SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * 
ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp 
+ 71 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 
* ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + 
SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); 
+ SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = 
SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + 
tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + 
SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik 
+ 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, 
tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 
15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp 
+ 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = 
SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = 
SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + 
SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = 
SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, 
tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = 
SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + 
SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = 
SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = 
SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + 
SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + 
const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * 
NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 7 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 7 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 7 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); 
+ SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + 
t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = 
SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, 
SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 
* ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, 
const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, 
const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 
* ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz 
= SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, 
tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); 
+ t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, 
const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = 
SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, 
t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk 
+ 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, 
tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk 
+ 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 
* ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, 
tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk 
+ 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * 
NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 
12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 8 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 8 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 8 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t6 
= SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = 
SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, 
const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, 
t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk 
+ 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, 
const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, 
t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * 
ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, 
t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * 
Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * 
NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 9 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 9 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 9 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * 
ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 129 * 
NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 136 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 4, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = 
SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk 
+ 10 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 
10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), 
tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, 
const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + 
tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 
6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t11 = 
SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(4)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 
10 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = 
SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 10 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 10 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 10 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 
139 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 74 * NPTS_LOCAL + 
p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + 
SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j 
* comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 
* ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = 
SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + 
tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * 
ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * 
NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, 
t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + 
SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = 
SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + 
tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz 
= SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw 
= SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); 
+ tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = 
SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 11 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 11 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 11 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * 
ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 
140 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 75 * NPTS_LOCAL + 
p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + 
SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * 
Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = 
SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = 
SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = 
SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw 
= SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + 
p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, 
t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 
* ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * 
NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty 
= SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + 
ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, 
t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 
* ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + 
t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * 
ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = 
SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); 
+ tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * 
ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 
8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = 
SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 12 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 12 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 12 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); 
+ tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = 
SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 
13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, 
SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = 
SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + 
SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * 
ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 53 * 
NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = 
SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, 
tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + 
t10 = SIMD_ALIGNED_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); 
+ Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = 
SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 
2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * 
NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + 
p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * 
ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 
9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 
= SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * 
NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 
13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + 
SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 13 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 13 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 13 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + 
SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t6 = 
SIMD_ALIGNED_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 144 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 4, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = 
SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = 
SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw 
= SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + 
SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * 
ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, 
const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = 
SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), 
tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 31 * 
NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t13 = SIMD_ALIGNED_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); 
comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(4)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SIMD_MUL(const_value_v, SIMD_DUPLICATE(&(const_value))); + tx = SIMD_UNALIGNED_LOAD((Xik + 0 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 0 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t0 = SIMD_ALIGNED_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SIMD_MUL(t0, const_value_w); + tz = SIMD_FMA(ty, t0, tz); + tw = SIMD_FMA(tx, t0, tw); + SIMD_UNALIGNED_STORE((Gik + 0 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 1 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 1 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t1 = SIMD_ALIGNED_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SIMD_MUL(t1, const_value_w); + tz = SIMD_FMA(ty, t1, tz); + tw = SIMD_FMA(tx, t1, tw); + SIMD_UNALIGNED_STORE((Gik + 1 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 2 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 2 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t2 = SIMD_ALIGNED_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SIMD_MUL(t2, const_value_w); + tz = SIMD_FMA(ty, t2, tz); + tw = SIMD_FMA(tx, t2, tw); + SIMD_UNALIGNED_STORE((Gik + 2 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 3 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 3 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t3 = SIMD_ALIGNED_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SIMD_MUL(t3, const_value_w); + tz = SIMD_FMA(ty, t3, tz); + tw = SIMD_FMA(tx, t3, tw); + SIMD_UNALIGNED_STORE((Gik + 3 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 4 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 
* ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 4 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t4 = SIMD_ALIGNED_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SIMD_MUL(t4, const_value_w); + tz = SIMD_FMA(ty, t4, tz); + tw = SIMD_FMA(tx, t4, tw); + SIMD_UNALIGNED_STORE((Gik + 4 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 5 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 5 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t5 = SIMD_ALIGNED_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SIMD_MUL(t5, const_value_w); + tz = SIMD_FMA(ty, t5, tz); + tw = SIMD_FMA(tx, t5, tw); + SIMD_UNALIGNED_STORE((Gik + 5 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 6 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 6 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t6 = SIMD_ALIGNED_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SIMD_MUL(t6, const_value_w); + tz = SIMD_FMA(ty, t6, tz); + tw = SIMD_FMA(tx, t6, tw); + SIMD_UNALIGNED_STORE((Gik + 6 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 7 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 7 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t7 = SIMD_ALIGNED_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SIMD_MUL(t7, const_value_w); + tz = SIMD_FMA(ty, t7, tz); + tw = SIMD_FMA(tx, t7, tw); + SIMD_UNALIGNED_STORE((Gik + 7 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 8 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 8 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t8 = SIMD_ALIGNED_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SIMD_MUL(t8, const_value_w); + tz = SIMD_FMA(ty, t8, tz); + tw = SIMD_FMA(tx, t8, tw); + SIMD_UNALIGNED_STORE((Gik + 8 
* ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 9 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 9 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t9 = SIMD_ALIGNED_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SIMD_MUL(t9, const_value_w); + tz = SIMD_FMA(ty, t9, tz); + tw = SIMD_FMA(tx, t9, tw); + SIMD_UNALIGNED_STORE((Gik + 9 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 10 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 10 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t10 = SIMD_ALIGNED_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SIMD_MUL(t10, const_value_w); + tz = SIMD_FMA(ty, t10, tz); + tw = SIMD_FMA(tx, t10, tw); + SIMD_UNALIGNED_STORE((Gik + 10 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 11 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 11 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t11 = SIMD_ALIGNED_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SIMD_MUL(t11, const_value_w); + tz = SIMD_FMA(ty, t11, tz); + tw = SIMD_FMA(tx, t11, tw); + SIMD_UNALIGNED_STORE((Gik + 11 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 12 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 12 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t12 = SIMD_ALIGNED_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SIMD_MUL(t12, const_value_w); + tz = SIMD_FMA(ty, t12, tz); + tw = SIMD_FMA(tx, t12, tw); + SIMD_UNALIGNED_STORE((Gik + 12 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 13 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 13 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t13 = 
SIMD_ALIGNED_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SIMD_MUL(t13, const_value_w); + tz = SIMD_FMA(ty, t13, tz); + tw = SIMD_FMA(tx, t13, tw); + SIMD_UNALIGNED_STORE((Gik + 13 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); + tx = SIMD_UNALIGNED_LOAD((Xik + 14 * ldX)); + ty = SIMD_UNALIGNED_LOAD((Xjk + 14 * ldX)); + tz = SIMD_UNALIGNED_LOAD((Gik + 14 * ldG)); + tw = SIMD_UNALIGNED_LOAD((Gjk + 14 * ldG)); + t14 = SIMD_ALIGNED_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SIMD_MUL(t14, const_value_w); + tz = SIMD_FMA(ty, t14, tz); + tw = SIMD_FMA(tx, t14, tw); + SIMD_UNALIGNED_STORE((Gik + 14 * ldG), tz); + SIMD_UNALIGNED_STORE((Gjk + 14 * ldG), tw); } for(; p_inner < npts_inner; p_inner += SCALAR_LENGTH) { @@ -8788,217 +47140,19393 @@ void integral_4_4(size_t npts, SCALAR_TYPE const_value_v = SCALAR_LOAD((weights + p_outer + p_inner)); - for(int c0 = 0; c0 <= 4; ++c0) { - for(int c1 = 0; c1 <= c0; ++c1) { - int m = 4 - c0; - int n = c0 - c1; - int p = c1; - - int idxB = (((4 - m) * (4 - m + 1)) >> 1) + p; - - double X_ABp = 1.0, comb_m_i = 1.0; - for(int i = 0; i <= m; ++i) { - double rcp_i; - - double Y_ABp = 1.0, comb_n_j = 1.0; - for(int j = 0; j <= n; ++j) { - double rcp_j; + double const_value, X_ABp, Y_ABp, Z_ABp, comb_m_i, comb_n_j, comb_p_k; + SCALAR_TYPE const_value_w; + SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; - double Z_ABp = 1.0, comb_p_k = 1.0; - for(int k = 0; k <= p; ++k) { - double rcp_k; - int mv, pv, Lv = 8 - i - j - k; - - int offset = (Lv * (Lv + 1) * (Lv + 2) - 120) / 6; - double const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; - SCALAR_TYPE tx, ty, tz, tw; - SCALAR_TYPE const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); - - mv = 4 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 0 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t0 
= SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t0 = SCALAR_MUL(t0, const_value_w); - tz = SCALAR_FMA(ty, t0, tz); - tw = SCALAR_FMA(tx, t0, tw); - SCALAR_STORE((Gik + 0 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 1 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 1 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t1 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t1 = SCALAR_MUL(t1, const_value_w); - tz = SCALAR_FMA(ty, t1, tz); - tw = SCALAR_FMA(tx, t1, tw); - SCALAR_STORE((Gik + 1 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 3 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 2 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 2 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t2 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t2 = SCALAR_MUL(t2, const_value_w); - tz = SCALAR_FMA(ty, t2, tz); - tw = SCALAR_FMA(tx, t2, tw); - SCALAR_STORE((Gik + 2 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 3 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 3 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t3 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t3 = SCALAR_MUL(t3, const_value_w); - tz = SCALAR_FMA(ty, t3, tz); - tw = SCALAR_FMA(tx, t3, tw); - SCALAR_STORE((Gik + 3 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 4 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 4 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t4 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * 
NPTS_LOCAL + p_inner)); - t4 = SCALAR_MUL(t4, const_value_w); - tz = SCALAR_FMA(ty, t4, tz); - tw = SCALAR_FMA(tx, t4, tw); - SCALAR_STORE((Gik + 4 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 2 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 5 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 5 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t5 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t5 = SCALAR_MUL(t5, const_value_w); - tz = SCALAR_FMA(ty, t5, tz); - tw = SCALAR_FMA(tx, t5, tw); - SCALAR_STORE((Gik + 5 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 6 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 6 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t6 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t6 = SCALAR_MUL(t6, const_value_w); - tz = SCALAR_FMA(ty, t6, tz); - tw = SCALAR_FMA(tx, t6, tw); - SCALAR_STORE((Gik + 6 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 7 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 7 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t7 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t7 = SCALAR_MUL(t7, const_value_w); - tz = SCALAR_FMA(ty, t7, tz); - tw = SCALAR_FMA(tx, t7, tw); - SCALAR_STORE((Gik + 7 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 8 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 8 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t8 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t8 = SCALAR_MUL(t8, const_value_w); - tz = 
SCALAR_FMA(ty, t8, tz); - tw = SCALAR_FMA(tx, t8, tw); - SCALAR_STORE((Gik + 8 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 1 + m - i; pv = 3 + p - k; - tx = SCALAR_LOAD((Xik + 9 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 9 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t9 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t9 = SCALAR_MUL(t9, const_value_w); - tz = SCALAR_FMA(ty, t9, tz); - tw = SCALAR_FMA(tx, t9, tw); - SCALAR_STORE((Gik + 9 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 0 + p - k; - tx = SCALAR_LOAD((Xik + 10 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 10 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t10 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t10 = SCALAR_MUL(t10, const_value_w); - tz = SCALAR_FMA(ty, t10, tz); - tw = SCALAR_FMA(tx, t10, tw); - SCALAR_STORE((Gik + 10 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 1 + p - k; - tx = SCALAR_LOAD((Xik + 11 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 11 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t11 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t11 = SCALAR_MUL(t11, const_value_w); - tz = SCALAR_FMA(ty, t11, tz); - tw = SCALAR_FMA(tx, t11, tw); - SCALAR_STORE((Gik + 11 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 2 + p - k; - tx = SCALAR_LOAD((Xik + 12 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 12 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t12 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t12 = SCALAR_MUL(t12, const_value_w); - tz = SCALAR_FMA(ty, t12, tz); - tw = SCALAR_FMA(tx, t12, 
tw); - SCALAR_STORE((Gik + 12 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 3 + p - k; - tx = SCALAR_LOAD((Xik + 13 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 13 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t13 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t13 = SCALAR_MUL(t13, const_value_w); - tz = SCALAR_FMA(ty, t13, tz); - tw = SCALAR_FMA(tx, t13, tw); - SCALAR_STORE((Gik + 13 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - mv = 0 + m - i; pv = 4 + p - k; - tx = SCALAR_LOAD((Xik + 14 * ldX)); - ty = SCALAR_LOAD((Xjk + idxB * ldX)); - tz = SCALAR_LOAD((Gik + 14 * ldG)); - tw = SCALAR_LOAD((Gjk + idxB * ldG)); - SCALAR_TYPE t14 = SCALAR_LOAD((temp + (offset + (((Lv - mv) * (Lv - mv + 1)) >> 1) + pv) * NPTS_LOCAL + p_inner)); - t14 = SCALAR_MUL(t14, const_value_w); - tz = SCALAR_FMA(ty, t14, tz); - tw = SCALAR_FMA(tx, t14, tw); - SCALAR_STORE((Gik + 14 * ldG), tz); - SCALAR_STORE((Gjk + idxB * ldG), tw); - - Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); - rcp_k = SCALAR_RECIPROCAL(k + 1); - comb_p_k = SCALAR_MUL(comb_p_k, p - k); - comb_p_k = SCALAR_MUL(comb_p_k, rcp_k); - } - - Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); - rcp_j = SCALAR_RECIPROCAL(j + 1); - comb_n_j = SCALAR_MUL(comb_n_j, n - j); - comb_n_j = SCALAR_MUL(comb_n_j, rcp_j); - } - - X_ABp = SCALAR_MUL(X_ABp, X_AB); - rcp_i = SCALAR_RECIPROCAL(i + 1); - comb_m_i = SCALAR_MUL(comb_m_i, m - i); - comb_m_i = SCALAR_MUL(comb_m_i, rcp_i); - } - } - } + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 100 * NPTS_LOCAL + 
p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, 
t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 4, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + 
t0 = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = 
SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + 
SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + 
tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = 
SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + 
SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + 
tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * 
NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = 
SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(4)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t5 = 
SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, 
t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 0 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 0 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = 
SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 101 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 108 * 
NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw 
= SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 69 * 
NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = 
SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 
= SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = 
SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 
= SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = 
SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = 
SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = 
SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = 
SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = 
SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, 
SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * 
NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, 
SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz 
= SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL 
+ p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 1 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 1 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = 
SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 102 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); 
+ tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t10 = 
SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * 
ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 64 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 
* ldG)); + t5 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + 
tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 3, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = 
SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = 
SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + 
ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); 
+ t5 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = 
SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = 
SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = 
SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + 
ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); 
+ t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = 
SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(3)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = 
SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 2 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = 
SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + 
ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t5 
= SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = 
SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 2 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 2 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz 
= SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 103 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 112 * 
NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw 
= SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 72 * 
NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = 
SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz 
= SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 41 * 
NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = 
SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 
= SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = 
SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 
= SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = 
SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t5 
= SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = 
SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = 
SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = 
SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = 
SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = 
SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = 
SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 3 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, 
const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 3 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 3 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 104 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); 
+ t5 = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz 
= SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * 
ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 65 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 
72 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw 
= SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + 
tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 73 * 
NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = 
SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); 
+ tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = 
SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + 
SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + 
tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 77 * 
NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = 
SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); 
+ tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = 
SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + 
SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * 
ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = 
SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + 
SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 
4 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz 
= SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + 
SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + 
tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = 
SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + 
SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 
4 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz 
= SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + 
SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * 
ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = 
SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + 
SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t0 = SCALAR_LOAD((temp + 
0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = 
SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 4 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 4 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 105 * NPTS_LOCAL + p_inner)); + t0 = 
SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik 
+ 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 66 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, 
const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), 
tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 36 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = 
SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + 
SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 
* ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = 
SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 
* ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, 
const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), 
tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = 
SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + 
SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 
* ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = 
SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 
* ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, 
const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), 
tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = 
SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 
5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = 
SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 5 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 5 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 106 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = 
SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 11 
* ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = 
SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * 
ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = 
SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * 
ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = 
SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * 
ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = 
SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + 
SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 
* ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + 
tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + 
SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 
* ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + 
tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + 
SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 
* ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz 
= SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk 
+ 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = 
SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 6 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 6 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 6 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 107 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = 
SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 
* ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 67 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + 
SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); 
+ ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 
7 * ldG)); + t11 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + 
SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); 
+ ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 
7 * ldG)); + t11 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + 
SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp 
+ 52 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + 
SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp 
+ 53 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = 
SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 
7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = 
SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 94 * 
NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = 
SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = 
SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 
= SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = 
SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 
= SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = 
SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = 
SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + 
p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = 
SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 7 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 7 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 7 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 108 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t6 = 
SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, 
tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 68 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk 
+ 8 * ldG)); + t1 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + 
tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), 
tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 37 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 
39 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = 
SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), 
tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 72 * 
NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = 
SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), 
tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = 
SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + 
SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, 
const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), 
tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + 
p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, 
tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, 
const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), 
tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, 
t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), 
tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = 
SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, 
tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); 
+ tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = 
SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, 
tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * 
ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); 
+ SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + 
tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t12 = 
SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 8 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 8 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 8 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 109 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), 
tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 
7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 133 * 
NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 69 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw 
= SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + 
t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 38 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 
= SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); 
+ tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 15 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = 
SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL 
+ p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw 
= SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 
9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp 
+ 89 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = 
SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = 
SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL 
+ p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw 
= SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, 
const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + 
SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t2 = 
SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, 
tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + 
SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 9 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 9 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 9 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t0 = SCALAR_LOAD((temp + 110 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t1 = SCALAR_LOAD((temp + 115 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t2 = SCALAR_LOAD((temp + 116 * 
NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t3 = SCALAR_LOAD((temp + 121 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t4 = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t5 = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t6 = SCALAR_LOAD((temp + 128 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t7 = SCALAR_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, 
tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t8 = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t9 = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t10 = SCALAR_LOAD((temp + 136 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t11 = SCALAR_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t12 = SCALAR_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * 
ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t13 = SCALAR_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t14 = SCALAR_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 4, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t0 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t1 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 10 * ldG)); + t2 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t3 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t4 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t5 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t6 = SCALAR_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t7 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = 
SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t8 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t9 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t10 = SCALAR_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t11 = SCALAR_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t12 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = 
SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t13 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t14 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t0 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t1 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = 
SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t2 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t3 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t4 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t5 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t6 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t7 = SCALAR_LOAD((temp + 
52 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t8 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t9 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t10 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t11 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t12 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = 
SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t13 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t14 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 
10 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = 
SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(4)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * 
ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * 
NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 10 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 10 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 10 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t0 = SCALAR_LOAD((temp + 111 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t1 = SCALAR_LOAD((temp + 116 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t2 = SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t3 = SCALAR_LOAD((temp + 122 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t4 = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t5 = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t6 = SCALAR_LOAD((temp + 129 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = 
SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t7 = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t8 = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t9 = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t10 = SCALAR_LOAD((temp + 137 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t11 = SCALAR_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + 
t12 = SCALAR_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t13 = SCALAR_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t14 = SCALAR_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t0 = SCALAR_LOAD((temp + 70 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t1 = SCALAR_LOAD((temp + 74 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + 
SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t2 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t3 = SCALAR_LOAD((temp + 79 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t4 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t5 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t6 = SCALAR_LOAD((temp + 85 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t7 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t8 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t9 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t10 = SCALAR_LOAD((temp + 92 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t11 = SCALAR_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 11 * ldG)); + t12 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t13 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t14 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 3, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t0 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t1 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, 
t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t2 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t3 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t4 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t5 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t6 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t7 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t8 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t9 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t10 = SCALAR_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t11 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = 
SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t12 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t13 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t14 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t0 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t1 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = 
SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t2 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t3 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t4 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t5 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t6 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), 
tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t7 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t8 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t9 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t10 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t11 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * 
ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t12 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t13 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t14 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t0 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t1 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); 
+ tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t2 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t3 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t4 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t5 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t6 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), 
tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t10 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t11 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * 
ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t12 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t13 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t14 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, 
const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + 
SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx 
= SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(3)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t1 = SCALAR_LOAD((temp + 19 
* NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); 
+ tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + 
SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t1 = 
SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = 
SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * 
ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 11 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 11 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 11 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t0 = SCALAR_LOAD((temp + 112 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t1 = 
SCALAR_LOAD((temp + 117 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t2 = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t3 = SCALAR_LOAD((temp + 123 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t4 = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t5 = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t6 = SCALAR_LOAD((temp + 130 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + 
tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t7 = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t8 = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t9 = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t10 = SCALAR_LOAD((temp + 138 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t11 = SCALAR_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + 
SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t12 = SCALAR_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t13 = SCALAR_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t14 = SCALAR_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t0 = SCALAR_LOAD((temp + 71 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 12 * ldG)); + t1 = SCALAR_LOAD((temp + 75 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t2 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t3 = SCALAR_LOAD((temp + 80 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t4 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t5 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t6 = SCALAR_LOAD((temp + 86 * NPTS_LOCAL + p_inner)); + t6 = 
SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t7 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t8 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t9 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t10 = SCALAR_LOAD((temp + 93 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t11 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = 
SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t12 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t13 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t14 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t0 = SCALAR_LOAD((temp + 39 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * 
ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t1 = SCALAR_LOAD((temp + 42 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t2 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t3 = SCALAR_LOAD((temp + 46 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t4 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t5 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t6 = SCALAR_LOAD((temp + 51 * NPTS_LOCAL + 
p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t7 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t8 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t9 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t10 = SCALAR_LOAD((temp + 57 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t11 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); 
+ tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t12 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t13 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t14 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t0 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * 
ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t1 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t2 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t3 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t4 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t5 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t6 = 
SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t7 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t8 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t9 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t10 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t11 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, 
const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t12 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t13 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t14 = SCALAR_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t0 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t1 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t2 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t3 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t4 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t5 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 
12 * ldG)); + t6 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t10 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t11 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = 
SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t12 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t13 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t14 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * 
ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 12 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + 
p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t0 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 
12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t1 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t2 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t3 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t4 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t5 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + 
tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t6 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t7 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t8 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t9 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t10 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t11 = 
SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t12 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t13 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t14 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + 
SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = 
SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = 
SCALAR_LOAD((Gjk + 12 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + 
SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 
6 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + 
tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 12 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 12 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 12 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t0 = SCALAR_LOAD((temp + 113 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + 
SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t1 = SCALAR_LOAD((temp + 118 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t2 = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t3 = SCALAR_LOAD((temp + 124 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t4 = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t5 = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t6 = SCALAR_LOAD((temp + 131 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t7 = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t8 = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t9 = SCALAR_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t10 = SCALAR_LOAD((temp + 139 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = 
SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t11 = SCALAR_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t12 = SCALAR_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t13 = SCALAR_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t14 = SCALAR_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t0 = SCALAR_LOAD((temp + 72 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + 
tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t1 = SCALAR_LOAD((temp + 76 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t2 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t3 = SCALAR_LOAD((temp + 81 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t4 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t5 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 13 * 
ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t6 = SCALAR_LOAD((temp + 87 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t7 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t8 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t9 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t10 = SCALAR_LOAD((temp + 94 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t11 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t12 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t13 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t14 = SCALAR_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t0 = SCALAR_LOAD((temp + 40 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); 
+ tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t1 = SCALAR_LOAD((temp + 43 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t2 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t3 = SCALAR_LOAD((temp + 47 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t4 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t5 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 13 * 
ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t6 = SCALAR_LOAD((temp + 52 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t7 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t8 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t9 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t10 = SCALAR_LOAD((temp + 58 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t11 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t12 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t13 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t14 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t0 = SCALAR_LOAD((temp + 16 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); 
+ tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t1 = SCALAR_LOAD((temp + 18 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t2 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t3 = SCALAR_LOAD((temp + 21 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t4 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t5 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 13 * 
ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t6 = SCALAR_LOAD((temp + 25 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t7 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t8 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t9 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t10 = SCALAR_LOAD((temp + 30 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); 
+ tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t11 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t12 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t13 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t14 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t0 = SCALAR_LOAD((temp + 73 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); 
+ tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t1 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t2 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t3 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t4 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t5 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), 
tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t6 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t7 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t8 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t9 = SCALAR_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t10 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty 
= SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t11 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t12 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t13 = SCALAR_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t14 = SCALAR_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t0 = SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, 
const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t1 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t2 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t3 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t4 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t5 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + 
SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t6 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t7 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t8 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t9 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t10 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t11 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t12 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t13 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t14 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t0 = SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); 
+ t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, 
tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); 
+ t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); 
+ SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = 
SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 13 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 13 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 13 * ldG), tw); + X_ABp = 1.0; comb_m_i = 1.0; + Y_ABp = 1.0; comb_n_j = 1.0; + Z_ABp = 1.0; comb_p_k = 1.0; + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t0 = SCALAR_LOAD((temp + 114 * NPTS_LOCAL + p_inner)); 
+ t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t1 = SCALAR_LOAD((temp + 119 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t2 = SCALAR_LOAD((temp + 120 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t3 = SCALAR_LOAD((temp + 125 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t4 = SCALAR_LOAD((temp + 126 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t5 = SCALAR_LOAD((temp + 127 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, tz); + tw = 
SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t6 = SCALAR_LOAD((temp + 132 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t7 = SCALAR_LOAD((temp + 133 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t8 = SCALAR_LOAD((temp + 134 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t9 = SCALAR_LOAD((temp + 135 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t10 = SCALAR_LOAD((temp + 140 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + SCALAR_STORE((Gjk 
+ 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t11 = SCALAR_LOAD((temp + 141 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t12 = SCALAR_LOAD((temp + 142 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t13 = SCALAR_LOAD((temp + 143 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t14 = SCALAR_LOAD((temp + 144 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 4, SCALAR_RECIPROCAL(1)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t0 = SCALAR_LOAD((temp + 
73 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t1 = SCALAR_LOAD((temp + 77 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t2 = SCALAR_LOAD((temp + 78 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t3 = SCALAR_LOAD((temp + 82 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t4 = SCALAR_LOAD((temp + 83 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t5 = SCALAR_LOAD((temp + 84 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = SCALAR_FMA(ty, t5, 
tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t6 = SCALAR_LOAD((temp + 88 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t7 = SCALAR_LOAD((temp + 89 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t8 = SCALAR_LOAD((temp + 90 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t9 = SCALAR_LOAD((temp + 91 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t10 = SCALAR_LOAD((temp + 95 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), tz); + 
SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t11 = SCALAR_LOAD((temp + 96 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t12 = SCALAR_LOAD((temp + 97 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t13 = SCALAR_LOAD((temp + 98 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t14 = SCALAR_LOAD((temp + 99 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 3, SCALAR_RECIPROCAL(2)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t0 = 
SCALAR_LOAD((temp + 41 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t1 = SCALAR_LOAD((temp + 44 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t2 = SCALAR_LOAD((temp + 45 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t3 = SCALAR_LOAD((temp + 48 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t4 = SCALAR_LOAD((temp + 49 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t5 = SCALAR_LOAD((temp + 50 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = 
SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t6 = SCALAR_LOAD((temp + 53 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t7 = SCALAR_LOAD((temp + 54 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t8 = SCALAR_LOAD((temp + 55 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t9 = SCALAR_LOAD((temp + 56 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t10 = SCALAR_LOAD((temp + 59 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), 
tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t11 = SCALAR_LOAD((temp + 60 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t12 = SCALAR_LOAD((temp + 61 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t13 = SCALAR_LOAD((temp + 62 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t14 = SCALAR_LOAD((temp + 63 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(3)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t0 
= SCALAR_LOAD((temp + 17 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t1 = SCALAR_LOAD((temp + 19 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t2 = SCALAR_LOAD((temp + 20 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t3 = SCALAR_LOAD((temp + 22 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t4 = SCALAR_LOAD((temp + 23 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t5 = SCALAR_LOAD((temp + 24 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz 
= SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t6 = SCALAR_LOAD((temp + 26 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t7 = SCALAR_LOAD((temp + 27 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t8 = SCALAR_LOAD((temp + 28 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t9 = SCALAR_LOAD((temp + 29 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t10 = SCALAR_LOAD((temp + 31 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * 
ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t11 = SCALAR_LOAD((temp + 32 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t12 = SCALAR_LOAD((temp + 33 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t13 = SCALAR_LOAD((temp + 34 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t14 = SCALAR_LOAD((temp + 35 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(4)); + const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; + const_value_w = SCALAR_MUL(const_value_v, SCALAR_DUPLICATE(&(const_value))); + tx = SCALAR_LOAD((Xik + 0 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 0 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); 
+ t0 = SCALAR_LOAD((temp + 0 * NPTS_LOCAL + p_inner)); + t0 = SCALAR_MUL(t0, const_value_w); + tz = SCALAR_FMA(ty, t0, tz); + tw = SCALAR_FMA(tx, t0, tw); + SCALAR_STORE((Gik + 0 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 1 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 1 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t1 = SCALAR_LOAD((temp + 1 * NPTS_LOCAL + p_inner)); + t1 = SCALAR_MUL(t1, const_value_w); + tz = SCALAR_FMA(ty, t1, tz); + tw = SCALAR_FMA(tx, t1, tw); + SCALAR_STORE((Gik + 1 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 2 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 2 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t2 = SCALAR_LOAD((temp + 2 * NPTS_LOCAL + p_inner)); + t2 = SCALAR_MUL(t2, const_value_w); + tz = SCALAR_FMA(ty, t2, tz); + tw = SCALAR_FMA(tx, t2, tw); + SCALAR_STORE((Gik + 2 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 3 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 3 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t3 = SCALAR_LOAD((temp + 3 * NPTS_LOCAL + p_inner)); + t3 = SCALAR_MUL(t3, const_value_w); + tz = SCALAR_FMA(ty, t3, tz); + tw = SCALAR_FMA(tx, t3, tw); + SCALAR_STORE((Gik + 3 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 4 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 4 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t4 = SCALAR_LOAD((temp + 4 * NPTS_LOCAL + p_inner)); + t4 = SCALAR_MUL(t4, const_value_w); + tz = SCALAR_FMA(ty, t4, tz); + tw = SCALAR_FMA(tx, t4, tw); + SCALAR_STORE((Gik + 4 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 5 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 5 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t5 = SCALAR_LOAD((temp + 5 * NPTS_LOCAL + p_inner)); + t5 = SCALAR_MUL(t5, const_value_w); + tz = 
SCALAR_FMA(ty, t5, tz); + tw = SCALAR_FMA(tx, t5, tw); + SCALAR_STORE((Gik + 5 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 6 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 6 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t6 = SCALAR_LOAD((temp + 6 * NPTS_LOCAL + p_inner)); + t6 = SCALAR_MUL(t6, const_value_w); + tz = SCALAR_FMA(ty, t6, tz); + tw = SCALAR_FMA(tx, t6, tw); + SCALAR_STORE((Gik + 6 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 7 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 7 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t7 = SCALAR_LOAD((temp + 7 * NPTS_LOCAL + p_inner)); + t7 = SCALAR_MUL(t7, const_value_w); + tz = SCALAR_FMA(ty, t7, tz); + tw = SCALAR_FMA(tx, t7, tw); + SCALAR_STORE((Gik + 7 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 8 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 8 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t8 = SCALAR_LOAD((temp + 8 * NPTS_LOCAL + p_inner)); + t8 = SCALAR_MUL(t8, const_value_w); + tz = SCALAR_FMA(ty, t8, tz); + tw = SCALAR_FMA(tx, t8, tw); + SCALAR_STORE((Gik + 8 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 9 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 9 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t9 = SCALAR_LOAD((temp + 9 * NPTS_LOCAL + p_inner)); + t9 = SCALAR_MUL(t9, const_value_w); + tz = SCALAR_FMA(ty, t9, tz); + tw = SCALAR_FMA(tx, t9, tw); + SCALAR_STORE((Gik + 9 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 10 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 10 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t10 = SCALAR_LOAD((temp + 10 * NPTS_LOCAL + p_inner)); + t10 = SCALAR_MUL(t10, const_value_w); + tz = SCALAR_FMA(ty, t10, tz); + tw = SCALAR_FMA(tx, t10, tw); + SCALAR_STORE((Gik + 10 * ldG), 
tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 11 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 11 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t11 = SCALAR_LOAD((temp + 11 * NPTS_LOCAL + p_inner)); + t11 = SCALAR_MUL(t11, const_value_w); + tz = SCALAR_FMA(ty, t11, tz); + tw = SCALAR_FMA(tx, t11, tw); + SCALAR_STORE((Gik + 11 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 12 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 12 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t12 = SCALAR_LOAD((temp + 12 * NPTS_LOCAL + p_inner)); + t12 = SCALAR_MUL(t12, const_value_w); + tz = SCALAR_FMA(ty, t12, tz); + tw = SCALAR_FMA(tx, t12, tw); + SCALAR_STORE((Gik + 12 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 13 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 13 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t13 = SCALAR_LOAD((temp + 13 * NPTS_LOCAL + p_inner)); + t13 = SCALAR_MUL(t13, const_value_w); + tz = SCALAR_FMA(ty, t13, tz); + tw = SCALAR_FMA(tx, t13, tw); + SCALAR_STORE((Gik + 13 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); + tx = SCALAR_LOAD((Xik + 14 * ldX)); + ty = SCALAR_LOAD((Xjk + 14 * ldX)); + tz = SCALAR_LOAD((Gik + 14 * ldG)); + tw = SCALAR_LOAD((Gjk + 14 * ldG)); + t14 = SCALAR_LOAD((temp + 14 * NPTS_LOCAL + p_inner)); + t14 = SCALAR_MUL(t14, const_value_w); + tz = SCALAR_FMA(ty, t14, tz); + tw = SCALAR_FMA(tx, t14, tw); + SCALAR_STORE((Gik + 14 * ldG), tz); + SCALAR_STORE((Gjk + 14 * ldG), tw); } } } diff --git a/tests/ref_data/h2o2_def2-qzvp.hdf5 b/tests/ref_data/h2o2_def2-qzvp.hdf5 new file mode 100644 index 00000000..df736635 Binary files /dev/null and b/tests/ref_data/h2o2_def2-qzvp.hdf5 differ diff --git a/tests/ref_data/h2o2_def2-tzvp.hdf5 b/tests/ref_data/h2o2_def2-tzvp.hdf5 new file mode 100644 index 00000000..b859aa20 Binary files /dev/null and 
b/tests/ref_data/h2o2_def2-tzvp.hdf5 differ diff --git a/tests/xc_integrator.cxx b/tests/xc_integrator.cxx index f10501c2..ef7b0f31 100644 --- a/tests/xc_integrator.cxx +++ b/tests/xc_integrator.cxx @@ -244,6 +244,11 @@ void test_xc_integrator( ExecutionSpace ex, const RuntimeEnvironment& rt, // Check K if( has_k and check_k and rks ) { + auto max_l = basis.max_l(); + if(max_l > 2 and ex == ExecutionSpace::Device) { + std::cout << "Skiping device sn-K + L > 2" << std::endl; + return; + } auto K = integrator.eval_exx( P ); CHECK((K - K.transpose()).norm() < std::numeric_limits::epsilon()); // Symmetric CHECK( (K - K_ref).norm() / basis.nbf() < 1e-7 ); @@ -408,7 +413,7 @@ TEST_CASE( "XC Integrator", "[xc-integrator]" ) { func, PruningScheme::Robust ); } - //GKS GGA Test + // GKS GGA Test SECTION( "H3 / BLYP / cc-pvdz" ) { auto func = make_functional(blyp, pol); test_integrator(GAUXC_REF_DATA_PATH "/h3_blyp_cc-pvdz_ssf_gks.bin", @@ -421,4 +426,18 @@ TEST_CASE( "XC Integrator", "[xc-integrator]" ) { test_integrator(GAUXC_REF_DATA_PATH "/benzene_631gd_pbe0_ufg.hdf5", func, PruningScheme::Unpruned ); } + + // sn-LinK + f functions + SECTION( "H2O2 / PBE0 / def2-TZVP" ) { + auto func = make_functional(pbe0, unpol); + test_integrator(GAUXC_REF_DATA_PATH "/h2o2_def2-tzvp.hdf5", + func, PruningScheme::Unpruned ); + } + + // sn-LinK + g functions + SECTION( "H2O2 / PBE0 / def2-QZVP" ) { + auto func = make_functional(pbe0, unpol); + test_integrator(GAUXC_REF_DATA_PATH "/h2o2_def2-qzvp.hdf5", + func, PruningScheme::Unpruned ); + } }