From 210bfaabfc15f6f5e5b5247740f80bf5cb8ef48e Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Thu, 31 Oct 2024 19:02:17 -0400 Subject: [PATCH] Do not merge - exemplar of simd This shows how we could do simd. Don't merge it. --- .../dsp/oscillators/ClassicOscillator.cpp | 142 +++++++++--------- src/common/globals.h | 19 ++- 2 files changed, 89 insertions(+), 72 deletions(-) diff --git a/src/common/dsp/oscillators/ClassicOscillator.cpp b/src/common/dsp/oscillators/ClassicOscillator.cpp index bc0f085e60e..f1828918f5d 100644 --- a/src/common/dsp/oscillators/ClassicOscillator.cpp +++ b/src/common/dsp/oscillators/ClassicOscillator.cpp @@ -186,10 +186,10 @@ void ClassicOscillator::init(float pitch, bool is_display, bool nonzero_init_dri first_run = true; charFilt.init(storage->getPatch().character.val.i); - osc_out = _mm_set1_ps(0.f); - osc_out2 = _mm_set1_ps(0.f); - osc_outR = _mm_set1_ps(0.f); - osc_out2R = _mm_set1_ps(0.f); + osc_out = SIMD_MM(set1_ps)(0.f); + osc_out2 = SIMD_MM(set1_ps)(0.f); + osc_outR = SIMD_MM(set1_ps)(0.f); + osc_out2R = SIMD_MM(set1_ps)(0.f); bufpos = 0; dc = 0; @@ -383,9 +383,9 @@ template void ClassicOscillator::convolute(int voice, bool stereo) */ unsigned int m = ((ipos >> 16) & 0xff) * (FIRipol_N << 1); unsigned int lipolui16 = (ipos & 0xffff); - __m128 lipol128 = _mm_setzero_ps(); - lipol128 = _mm_cvtsi32_ss(lipol128, lipolui16); - lipol128 = _mm_shuffle_ps(lipol128, lipol128, _MM_SHUFFLE(0, 0, 0, 0)); + auto lipol128 = SIMD_MM(setzero_ps)(); + lipol128 = SIMD_MM(cvtsi32_ss)(lipol128, lipolui16); + lipol128 = SIMD_MM(shuffle_ps)(lipol128, lipol128, _MM_SHUFFLE(0, 0, 0, 0)); int k; const float s = 0.99952f; @@ -495,25 +495,25 @@ template void ClassicOscillator::convolute(int voice, bool stereo) if (stereo) { - __m128 g128L = _mm_load_ss(&g); - g128L = _mm_shuffle_ps(g128L, g128L, _MM_SHUFFLE(0, 0, 0, 0)); - __m128 g128R = _mm_load_ss(&gR); - g128R = _mm_shuffle_ps(g128R, g128R, _MM_SHUFFLE(0, 0, 0, 0)); + auto g128L = SIMD_MM(load_ss)(&g); +
g128L = SIMD_MM(shuffle_ps)(g128L, g128L, _MM_SHUFFLE(0, 0, 0, 0)); + auto g128R = SIMD_MM(load_ss)(&gR); + g128R = SIMD_MM(shuffle_ps)(g128R, g128R, _MM_SHUFFLE(0, 0, 0, 0)); for (k = 0; k < FIRipol_N; k += 4) { float *obfL = &oscbuffer[bufpos + k + delay]; float *obfR = &oscbufferR[bufpos + k + delay]; - __m128 obL = _mm_loadu_ps(obfL); - __m128 obR = _mm_loadu_ps(obfR); - __m128 st = _mm_load_ps(&storage->sinctable[m + k]); - __m128 so = _mm_load_ps(&storage->sinctable[m + k + FIRipol_N]); - so = _mm_mul_ps(so, lipol128); - st = _mm_add_ps(st, so); - obL = _mm_add_ps(obL, _mm_mul_ps(st, g128L)); - _mm_storeu_ps(obfL, obL); - obR = _mm_add_ps(obR, _mm_mul_ps(st, g128R)); - _mm_storeu_ps(obfR, obR); + auto obL = SIMD_MM(loadu_ps)(obfL); + auto obR = SIMD_MM(loadu_ps)(obfR); + auto st = SIMD_MM(load_ps)(&storage->sinctable[m + k]); + auto so = SIMD_MM(load_ps)(&storage->sinctable[m + k + FIRipol_N]); + so = SIMD_MM(mul_ps)(so, lipol128); + st = SIMD_MM(add_ps)(st, so); + obL = SIMD_MM(add_ps)(obL, SIMD_MM(mul_ps)(st, g128L)); + SIMD_MM(storeu_ps)(obfL, obL); + obR = SIMD_MM(add_ps)(obR, SIMD_MM(mul_ps)(st, g128R)); + SIMD_MM(storeu_ps)(obfR, obR); } } else @@ -521,22 +521,22 @@ template void ClassicOscillator::convolute(int voice, bool stereo) /* ** This is SSE for the convolution described above */ - __m128 g128 = _mm_load_ss(&g); - g128 = _mm_shuffle_ps(g128, g128, _MM_SHUFFLE(0, 0, 0, 0)); + auto g128 = SIMD_MM(load_ss)(&g); + g128 = SIMD_MM(shuffle_ps)(g128, g128, _MM_SHUFFLE(0, 0, 0, 0)); for (k = 0; k < FIRipol_N; k += 4) { float *obf = &oscbuffer[bufpos + k + delay]; // Get buffer[pos + delay + k ] - __m128 ob = _mm_loadu_ps(obf); - __m128 st = _mm_load_ps( + auto ob = SIMD_MM(loadu_ps)(obf); + auto st = SIMD_MM(load_ps)( &storage->sinctable[m + k]); // get the sinctable for our fractional position - __m128 so = - _mm_load_ps(&storage->sinctable[m + k + FIRipol_N]); // get the sinctable deriv - so = _mm_mul_ps(so, lipol128); // scale the deriv by the lipol 
fractional time - st = _mm_add_ps(st, so); // this is now st = sinctable + dt * dsinctable - st = _mm_mul_ps(st, g128); // so this is now the convolved difference, g * kernel - ob = _mm_add_ps(ob, st); // which we add back onto the buffer - _mm_storeu_ps(obf, ob); // and store. + auto so = + SIMD_MM(load_ps)(&storage->sinctable[m + k + FIRipol_N]); // get the sinctable deriv + so = SIMD_MM(mul_ps)(so, lipol128); // scale the deriv by the lipol fractional time + st = SIMD_MM(add_ps)(st, so); // this is now st = sinctable + dt * dsinctable + st = SIMD_MM(mul_ps)(st, g128); // so this is now the convolved difference, g * kernel + ob = SIMD_MM(add_ps)(ob, st); // which we add back onto the buffer + SIMD_MM(storeu_ps)(obf, ob); // and store. } } @@ -715,44 +715,44 @@ void ClassicOscillator::process_block(float pitch0, float drift, bool stereo, bo /* ** And the DC offset and pitch-scaled output attenuation */ - __m128 mdc = _mm_load_ss(&dc); - __m128 oa = _mm_load_ss(&out_attenuation); - oa = _mm_mul_ss(oa, _mm_load_ss(&pitchmult)); + auto mdc = SIMD_MM(load_ss)(&dc); + auto oa = SIMD_MM(load_ss)(&out_attenuation); + oa = SIMD_MM(mul_ss)(oa, SIMD_MM(load_ss)(&pitchmult)); /* ** The Coefs here are from the character filter, and are set in ::init */ - const __m128 mmone = _mm_set_ss(1.0f); - __m128 char_b0 = _mm_load_ss(&(charFilt.CoefB0)); - __m128 char_b1 = _mm_load_ss(&(charFilt.CoefB1)); - __m128 char_a1 = _mm_load_ss(&(charFilt.CoefA1)); + const auto mmone = SIMD_MM(set_ss)(1.0f); + auto char_b0 = SIMD_MM(load_ss)(&(charFilt.CoefB0)); + auto char_b1 = SIMD_MM(load_ss)(&(charFilt.CoefB1)); + auto char_a1 = SIMD_MM(load_ss)(&(charFilt.CoefA1)); for (k = 0; k < BLOCK_SIZE_OS; k++) { - __m128 dcb = _mm_load_ss(&dcbuffer[bufpos + k]); - __m128 hpf = _mm_load_ss(&hpfblock[k]); - __m128 ob = _mm_load_ss(&oscbuffer[bufpos + k]); + auto dcb = SIMD_MM(load_ss)(&dcbuffer[bufpos + k]); + auto hpf = SIMD_MM(load_ss)(&hpfblock[k]); + auto ob = SIMD_MM(load_ss)(&oscbuffer[bufpos + 
k]); /* ** a = prior output * HPF value */ - __m128 a = _mm_mul_ss(osc_out, hpf); + auto a = SIMD_MM(mul_ss)(osc_out, hpf); /* ** mdc += DC level */ - mdc = _mm_add_ss(mdc, dcb); + mdc = SIMD_MM(add_ss)(mdc, dcb); /* ** output buffer += DC * out attenuation */ - ob = _mm_sub_ss(ob, _mm_mul_ss(mdc, oa)); + ob = SIMD_MM(sub_ss)(ob, SIMD_MM(mul_ss)(mdc, oa)); /* ** Stow away the last output and make the new output the oscbuffer + the filter controbution */ - __m128 LastOscOut = osc_out; - osc_out = _mm_add_ss(a, ob); + auto LastOscOut = osc_out; + osc_out = SIMD_MM(add_ss)(a, ob); /* ** So at that point osc_out = a + ob; = prior_out * HPF + oscbuffer + DC * attenuation; @@ -767,37 +767,37 @@ void ClassicOscillator::process_block(float pitch0, float drift, bool stereo, bo */ osc_out2 = - _mm_add_ss(_mm_mul_ss(osc_out2, char_a1), - _mm_add_ss(_mm_mul_ss(osc_out, char_b0), _mm_mul_ss(LastOscOut, char_b1))); + SIMD_MM(add_ss)(SIMD_MM(mul_ss)(osc_out2, char_a1), + SIMD_MM(add_ss)(SIMD_MM(mul_ss)(osc_out, char_b0), SIMD_MM(mul_ss)(LastOscOut, char_b1))); /* ** And so store the output of the HPF as the output */ - _mm_store_ss(&output[k], osc_out2); + SIMD_MM(store_ss)(&output[k], osc_out2); // And do it all again if we are stereo if (stereo) { - ob = _mm_load_ss(&oscbufferR[bufpos + k]); + ob = SIMD_MM(load_ss)(&oscbufferR[bufpos + k]); - a = _mm_mul_ss(osc_outR, hpf); + a = SIMD_MM(mul_ss)(osc_outR, hpf); - ob = _mm_sub_ss(ob, _mm_mul_ss(mdc, oa)); - __m128 LastOscOutR = osc_outR; - osc_outR = _mm_add_ss(a, ob); + ob = SIMD_MM(sub_ss)(ob, SIMD_MM(mul_ss)(mdc, oa)); + auto LastOscOutR = osc_outR; + osc_outR = SIMD_MM(add_ss)(a, ob); - osc_out2R = _mm_add_ss( - _mm_mul_ss(osc_out2R, char_a1), - _mm_add_ss(_mm_mul_ss(osc_outR, char_b0), _mm_mul_ss(LastOscOutR, char_b1))); + osc_out2R = SIMD_MM(add_ss)( + SIMD_MM(mul_ss)(osc_out2R, char_a1), + SIMD_MM(add_ss)(SIMD_MM(mul_ss)(osc_outR, char_b0), SIMD_MM(mul_ss)(LastOscOutR, char_b1))); - _mm_store_ss(&outputR[k], osc_out2R); + 
SIMD_MM(store_ss)(&outputR[k], osc_out2R); } } /* ** Store the DC accumulation */ - _mm_store_ss(&dc, mdc); + SIMD_MM(store_ss)(&dc, mdc); /* ** And clean up and advance our buffer pointer @@ -819,24 +819,24 @@ void ClassicOscillator::process_block(float pitch0, float drift, bool stereo, bo */ if (bufpos == 0) // only needed if the new bufpos == 0 { - __m128 overlap[FIRipol_N >> 2], dcoverlap[FIRipol_N >> 2], overlapR[FIRipol_N >> 2]; - const __m128 zero = _mm_setzero_ps(); + SIMD_M128 overlap[FIRipol_N >> 2], dcoverlap[FIRipol_N >> 2], overlapR[FIRipol_N >> 2]; + const auto zero = SIMD_MM(setzero_ps)(); for (k = 0; k < (FIRipol_N); k += 4) { - overlap[k >> 2] = _mm_load_ps(&oscbuffer[OB_LENGTH + k]); - _mm_store_ps(&oscbuffer[k], overlap[k >> 2]); - _mm_store_ps(&oscbuffer[OB_LENGTH + k], zero); + overlap[k >> 2] = SIMD_MM(load_ps)(&oscbuffer[OB_LENGTH + k]); + SIMD_MM(store_ps)(&oscbuffer[k], overlap[k >> 2]); + SIMD_MM(store_ps)(&oscbuffer[OB_LENGTH + k], zero); - dcoverlap[k >> 2] = _mm_load_ps(&dcbuffer[OB_LENGTH + k]); - _mm_store_ps(&dcbuffer[k], dcoverlap[k >> 2]); - _mm_store_ps(&dcbuffer[OB_LENGTH + k], zero); + dcoverlap[k >> 2] = SIMD_MM(load_ps)(&dcbuffer[OB_LENGTH + k]); + SIMD_MM(store_ps)(&dcbuffer[k], dcoverlap[k >> 2]); + SIMD_MM(store_ps)(&dcbuffer[OB_LENGTH + k], zero); if (stereo) { - overlapR[k >> 2] = _mm_load_ps(&oscbufferR[OB_LENGTH + k]); - _mm_store_ps(&oscbufferR[k], overlapR[k >> 2]); - _mm_store_ps(&oscbufferR[OB_LENGTH + k], zero); + overlapR[k >> 2] = SIMD_MM(load_ps)(&oscbufferR[OB_LENGTH + k]); + SIMD_MM(store_ps)(&oscbufferR[k], overlapR[k >> 2]); + SIMD_MM(store_ps)(&oscbufferR[OB_LENGTH + k], zero); } } } diff --git a/src/common/globals.h b/src/common/globals.h index d2458be6fed..9ed3b38e840 100644 --- a/src/common/globals.h +++ b/src/common/globals.h @@ -33,8 +33,12 @@ static_assert(__cplusplus == 201703L, "Surge requires C++17; please update your build"); #if (defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \ - 
(defined(_M_IX86_FP) && _M_IX86_FP >= 2)) //&& !defined(_M_ARM64EC) + (defined(_M_IX86_FP) && _M_IX86_FP >= 2)) || defined(_M_ARM64EC) #include + +#include +#include "simde/x86/sse2.h" + #else // With the upgrade to simde 0.8.2 and subsequent conversations // with simde maintainers, this include should work for every @@ -44,10 +48,23 @@ static_assert(__cplusplus == 201703L, "Surge requires C++17; please update your // // and just always include this in the else side #include +#ifndef SIMDE_SKIP_ENABLE_NATIVE_ALIASES #define SIMDE_ENABLE_NATIVE_ALIASES +#endif + #include "simde/x86/sse2.h" #endif +#if (defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \ +(defined(_M_IX86_FP) && _M_IX86_FP >= 2)) // note this *doesn't* have ARM64EC +#define SIMD_MM(x) _mm_##x +#define SIMD_M128 __m128 +#else +#define SIMD_MM(x) simde_mm_##x +#define SIMD_M128 simde__m128 +#endif + + #if MAC || LINUX #include