Skip to content

Commit

Permalink
Use fused multiply add instructions in chebyshev polynomial sine
Browse files Browse the repository at this point in the history
  • Loading branch information
williamyang98 committed Jan 21, 2024
1 parent 512b6af commit ac4b013
Showing 1 changed file with 26 additions and 14 deletions.
40 changes: 26 additions & 14 deletions src/ofdm/dsp/chebyshev_sine.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,19 @@ static inline __m128 _mm_chebyshev_sine(__m128 x) {
const __m128 A4 = _mm_set1_ps(-14.049663543701171875f);
const __m128 A5 = _mm_set1_ps( 3.161602020263671875f);
// Calculate g(x) = a5*x^10 + a4*x^8 + a3*x^6 + a2*x^4 + a1*x^2 + a0
const __m128 z = _mm_mul_ps(x,x); // z = x^2
const __m128 b5 = A5; // a5*z^0
const __m128 b4 = _mm_add_ps(_mm_mul_ps(b5,z),A4); // a5*z^1 + a4*z^0
const __m128 b3 = _mm_add_ps(_mm_mul_ps(b4,z),A3); // a5*z^2 + a4*z^1 + a3*z^0
const __m128 b2 = _mm_add_ps(_mm_mul_ps(b3,z),A2); // a5*z^3 + a4*z^2 + a3*z^1 + a2*z^0
const __m128 b1 = _mm_add_ps(_mm_mul_ps(b2,z),A1); // a5*z^4 + a4*z^3 + a3*z^2 + a2*z^1 + a1*z^0
const __m128 b0 = _mm_add_ps(_mm_mul_ps(b1,z),A0); // a5*z^5 + a4*z^4 + a3*z^3 + a2*z^2 + a1*z^1 + a0*z^0
#if __FMA__
#define __muladd(a,b,c) _mm_fmadd_ps(a,b,c)
#else
#define __muladd(a,b,c) _mm_add_ps(_mm_mul_ps(a,b),c)
#endif
const __m128 z = _mm_mul_ps(x,x); // z = x^2
const __m128 b5 = A5; // a5*z^0
const __m128 b4 = __muladd(b5,z,A4); // a5*z^1 + a4*z^0
const __m128 b3 = __muladd(b4,z,A3); // a5*z^2 + a4*z^1 + a3*z^0
const __m128 b2 = __muladd(b3,z,A2); // a5*z^3 + a4*z^2 + a3*z^1 + a2*z^0
const __m128 b1 = __muladd(b2,z,A1); // a5*z^4 + a4*z^3 + a3*z^2 + a2*z^1 + a1*z^0
const __m128 b0 = __muladd(b1,z,A0); // a5*z^5 + a4*z^4 + a3*z^3 + a2*z^2 + a1*z^1 + a0*z^0
#undef __muladd
// Calculate f(x) = g(x) * (x-0.5) * (x+0.5) * x
// f(x) = g(x) * (x^2 - 0.25) * x
// f(x) = g(x) * (z-0.25) * x
Expand All @@ -63,13 +69,19 @@ static inline __m256 _mm256_chebyshev_sine(__m256 x) {
const __m256 A4 = _mm256_set1_ps(-14.049663543701171875f);
const __m256 A5 = _mm256_set1_ps( 3.161602020263671875f);
// Calculate g(x) = a5*x^10 + a4*x^8 + a3*x^6 + a2*x^4 + a1*x^2 + a0
const __m256 z = _mm256_mul_ps(x,x); // z = x^2
const __m256 b5 = A5; // a5*z^0
const __m256 b4 = _mm256_add_ps(_mm256_mul_ps(b5,z),A4); // a5*z^1 + a4*z^0
const __m256 b3 = _mm256_add_ps(_mm256_mul_ps(b4,z),A3); // a5*z^2 + a4*z^1 + a3*z^0
const __m256 b2 = _mm256_add_ps(_mm256_mul_ps(b3,z),A2); // a5*z^3 + a4*z^2 + a3*z^1 + a2*z^0
const __m256 b1 = _mm256_add_ps(_mm256_mul_ps(b2,z),A1); // a5*z^4 + a4*z^3 + a3*z^2 + a2*z^1 + a1*z^0
const __m256 b0 = _mm256_add_ps(_mm256_mul_ps(b1,z),A0); // a5*z^5 + a4*z^4 + a3*z^3 + a2*z^2 + a1*z^1 + a0*z^0
#if __FMA__
#define __muladd(a,b,c) _mm256_fmadd_ps(a,b,c)
#else
#define __muladd(a,b,c) _mm256_add_ps(_mm256_mul_ps(a,b),c)
#endif
const __m256 z = _mm256_mul_ps(x,x); // z = x^2
const __m256 b5 = A5; // a5*z^0
const __m256 b4 = __muladd(b5,z,A4); // a5*z^1 + a4*z^0
const __m256 b3 = __muladd(b4,z,A3); // a5*z^2 + a4*z^1 + a3*z^0
const __m256 b2 = __muladd(b3,z,A2); // a5*z^3 + a4*z^2 + a3*z^1 + a2*z^0
const __m256 b1 = __muladd(b2,z,A1); // a5*z^4 + a4*z^3 + a3*z^2 + a2*z^1 + a1*z^0
const __m256 b0 = __muladd(b1,z,A0); // a5*z^5 + a4*z^4 + a3*z^3 + a2*z^2 + a1*z^1 + a0*z^0
#undef __muladd
// Calculate f(x) = g(x) * (x-0.5) * (x+0.5) * x
// f(x) = g(x) * (x^2 - 0.25) * x
// f(x) = g(x) * (z-0.25) * x
Expand Down

0 comments on commit ac4b013

Please sign in to comment.