Skip to content

Commit

Permalink
Do not merge - exemplar of SIMD
Browse files Browse the repository at this point in the history
This shows how we could do simd. Don't merge it.
  • Loading branch information
baconpaul committed Oct 31, 2024
1 parent d9c1cc4 commit 210bfaa
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 72 deletions.
142 changes: 71 additions & 71 deletions src/common/dsp/oscillators/ClassicOscillator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,10 +186,10 @@ void ClassicOscillator::init(float pitch, bool is_display, bool nonzero_init_dri
first_run = true;
charFilt.init(storage->getPatch().character.val.i);

osc_out = _mm_set1_ps(0.f);
osc_out2 = _mm_set1_ps(0.f);
osc_outR = _mm_set1_ps(0.f);
osc_out2R = _mm_set1_ps(0.f);
osc_out = SIMD_MM(set1_ps)(0.f);
osc_out2 = SIMD_MM(set1_ps)(0.f);
osc_outR = SIMD_MM(set1_ps)(0.f);
osc_out2R = SIMD_MM(set1_ps)(0.f);
bufpos = 0;
dc = 0;

Expand Down Expand Up @@ -383,9 +383,9 @@ template <bool FM> void ClassicOscillator::convolute(int voice, bool stereo)
*/
unsigned int m = ((ipos >> 16) & 0xff) * (FIRipol_N << 1);
unsigned int lipolui16 = (ipos & 0xffff);
__m128 lipol128 = _mm_setzero_ps();
lipol128 = _mm_cvtsi32_ss(lipol128, lipolui16);
lipol128 = _mm_shuffle_ps(lipol128, lipol128, _MM_SHUFFLE(0, 0, 0, 0));
auto lipol128 = SIMD_MM(setzero_ps)();
lipol128 = SIMD_MM(cvtsi32_ss)(lipol128, lipolui16);
lipol128 = SIMD_MM(shuffle_ps)(lipol128, lipol128, _MM_SHUFFLE(0, 0, 0, 0));

int k;
const float s = 0.99952f;
Expand Down Expand Up @@ -495,48 +495,48 @@ template <bool FM> void ClassicOscillator::convolute(int voice, bool stereo)

if (stereo)
{
__m128 g128L = _mm_load_ss(&g);
g128L = _mm_shuffle_ps(g128L, g128L, _MM_SHUFFLE(0, 0, 0, 0));
__m128 g128R = _mm_load_ss(&gR);
g128R = _mm_shuffle_ps(g128R, g128R, _MM_SHUFFLE(0, 0, 0, 0));
auto g128L = SIMD_MM(load_ss)(&g);
g128L = SIMD_MM(shuffle_ps)(g128L, g128L, _MM_SHUFFLE(0, 0, 0, 0));
auto g128R = SIMD_MM(load_ss)(&gR);
g128R = SIMD_MM(shuffle_ps)(g128R, g128R, _MM_SHUFFLE(0, 0, 0, 0));

for (k = 0; k < FIRipol_N; k += 4)
{
float *obfL = &oscbuffer[bufpos + k + delay];
float *obfR = &oscbufferR[bufpos + k + delay];
__m128 obL = _mm_loadu_ps(obfL);
__m128 obR = _mm_loadu_ps(obfR);
__m128 st = _mm_load_ps(&storage->sinctable[m + k]);
__m128 so = _mm_load_ps(&storage->sinctable[m + k + FIRipol_N]);
so = _mm_mul_ps(so, lipol128);
st = _mm_add_ps(st, so);
obL = _mm_add_ps(obL, _mm_mul_ps(st, g128L));
_mm_storeu_ps(obfL, obL);
obR = _mm_add_ps(obR, _mm_mul_ps(st, g128R));
_mm_storeu_ps(obfR, obR);
auto obL = SIMD_MM(loadu_ps)(obfL);
auto obR = SIMD_MM(loadu_ps)(obfR);
auto st = SIMD_MM(load_ps)(&storage->sinctable[m + k]);
auto so = SIMD_MM(load_ps)(&storage->sinctable[m + k + FIRipol_N]);
so = SIMD_MM(mul_ps)(so, lipol128);
st = SIMD_MM(add_ps)(st, so);
obL = SIMD_MM(add_ps)(obL, SIMD_MM(mul_ps)(st, g128L));
SIMD_MM(storeu_ps)(obfL, obL);
obR = SIMD_MM(add_ps)(obR, SIMD_MM(mul_ps)(st, g128R));
SIMD_MM(storeu_ps)(obfR, obR);
}
}
else
{
/*
** This is SSE for the convolution described above
*/
__m128 g128 = _mm_load_ss(&g);
g128 = _mm_shuffle_ps(g128, g128, _MM_SHUFFLE(0, 0, 0, 0));
auto g128 = SIMD_MM(load_ss)(&g);
g128 = SIMD_MM(shuffle_ps)(g128, g128, _MM_SHUFFLE(0, 0, 0, 0));

for (k = 0; k < FIRipol_N; k += 4)
{
float *obf = &oscbuffer[bufpos + k + delay]; // Get buffer[pos + delay + k ]
__m128 ob = _mm_loadu_ps(obf);
__m128 st = _mm_load_ps(
auto ob = SIMD_MM(loadu_ps)(obf);
auto st = SIMD_MM(load_ps)(
&storage->sinctable[m + k]); // get the sinctable for our fractional position
__m128 so =
_mm_load_ps(&storage->sinctable[m + k + FIRipol_N]); // get the sinctable deriv
so = _mm_mul_ps(so, lipol128); // scale the deriv by the lipol fractional time
st = _mm_add_ps(st, so); // this is now st = sinctable + dt * dsinctable
st = _mm_mul_ps(st, g128); // so this is now the convolved difference, g * kernel
ob = _mm_add_ps(ob, st); // which we add back onto the buffer
_mm_storeu_ps(obf, ob); // and store.
auto so =
SIMD_MM(load_ps)(&storage->sinctable[m + k + FIRipol_N]); // get the sinctable deriv
so = SIMD_MM(mul_ps)(so, lipol128); // scale the deriv by the lipol fractional time
st = SIMD_MM(add_ps)(st, so); // this is now st = sinctable + dt * dsinctable
st = SIMD_MM(mul_ps)(st, g128); // so this is now the convolved difference, g * kernel
ob = SIMD_MM(add_ps)(ob, st); // which we add back onto the buffer
SIMD_MM(storeu_ps)(obf, ob); // and store.
}
}

Expand Down Expand Up @@ -715,44 +715,44 @@ void ClassicOscillator::process_block(float pitch0, float drift, bool stereo, bo
/*
** And the DC offset and pitch-scaled output attenuation
*/
__m128 mdc = _mm_load_ss(&dc);
__m128 oa = _mm_load_ss(&out_attenuation);
oa = _mm_mul_ss(oa, _mm_load_ss(&pitchmult));
auto mdc = SIMD_MM(load_ss)(&dc);
auto oa = SIMD_MM(load_ss)(&out_attenuation);
oa = SIMD_MM(mul_ss)(oa, SIMD_MM(load_ss)(&pitchmult));

/*
** The Coefs here are from the character filter, and are set in ::init
*/
const __m128 mmone = _mm_set_ss(1.0f);
__m128 char_b0 = _mm_load_ss(&(charFilt.CoefB0));
__m128 char_b1 = _mm_load_ss(&(charFilt.CoefB1));
__m128 char_a1 = _mm_load_ss(&(charFilt.CoefA1));
const auto mmone = SIMD_MM(set_ss)(1.0f);
auto char_b0 = SIMD_MM(load_ss)(&(charFilt.CoefB0));
auto char_b1 = SIMD_MM(load_ss)(&(charFilt.CoefB1));
auto char_a1 = SIMD_MM(load_ss)(&(charFilt.CoefA1));

for (k = 0; k < BLOCK_SIZE_OS; k++)
{
__m128 dcb = _mm_load_ss(&dcbuffer[bufpos + k]);
__m128 hpf = _mm_load_ss(&hpfblock[k]);
__m128 ob = _mm_load_ss(&oscbuffer[bufpos + k]);
auto dcb = SIMD_MM(load_ss)(&dcbuffer[bufpos + k]);
auto hpf = SIMD_MM(load_ss)(&hpfblock[k]);
auto ob = SIMD_MM(load_ss)(&oscbuffer[bufpos + k]);

/*
** a = prior output * HPF value
*/
__m128 a = _mm_mul_ss(osc_out, hpf);
auto a = SIMD_MM(mul_ss)(osc_out, hpf);

/*
** mdc += DC level
*/
mdc = _mm_add_ss(mdc, dcb);
mdc = SIMD_MM(add_ss)(mdc, dcb);

/*
** output buffer -= DC * out attenuation
*/
ob = _mm_sub_ss(ob, _mm_mul_ss(mdc, oa));
ob = SIMD_MM(sub_ss)(ob, SIMD_MM(mul_ss)(mdc, oa));

/*
** Stow away the last output and make the new output the oscbuffer + the filter contribution
*/
__m128 LastOscOut = osc_out;
osc_out = _mm_add_ss(a, ob);
auto LastOscOut = osc_out;
osc_out = SIMD_MM(add_ss)(a, ob);

/*
** So at that point osc_out = a + ob; = prior_out * HPF + oscbuffer - DC * attenuation;
Expand All @@ -767,37 +767,37 @@ void ClassicOscillator::process_block(float pitch0, float drift, bool stereo, bo
*/

osc_out2 =
_mm_add_ss(_mm_mul_ss(osc_out2, char_a1),
_mm_add_ss(_mm_mul_ss(osc_out, char_b0), _mm_mul_ss(LastOscOut, char_b1)));
SIMD_MM(add_ss)(SIMD_MM(mul_ss)(osc_out2, char_a1),
SIMD_MM(add_ss)(SIMD_MM(mul_ss)(osc_out, char_b0), SIMD_MM(mul_ss)(LastOscOut, char_b1)));

/*
** And so store the output of the HPF as the output
*/
_mm_store_ss(&output[k], osc_out2);
SIMD_MM(store_ss)(&output[k], osc_out2);

// And do it all again if we are stereo
if (stereo)
{
ob = _mm_load_ss(&oscbufferR[bufpos + k]);
ob = SIMD_MM(load_ss)(&oscbufferR[bufpos + k]);

a = _mm_mul_ss(osc_outR, hpf);
a = SIMD_MM(mul_ss)(osc_outR, hpf);

ob = _mm_sub_ss(ob, _mm_mul_ss(mdc, oa));
__m128 LastOscOutR = osc_outR;
osc_outR = _mm_add_ss(a, ob);
ob = SIMD_MM(sub_ss)(ob, SIMD_MM(mul_ss)(mdc, oa));
auto LastOscOutR = osc_outR;
osc_outR = SIMD_MM(add_ss)(a, ob);

osc_out2R = _mm_add_ss(
_mm_mul_ss(osc_out2R, char_a1),
_mm_add_ss(_mm_mul_ss(osc_outR, char_b0), _mm_mul_ss(LastOscOutR, char_b1)));
osc_out2R = SIMD_MM(add_ss)(
SIMD_MM(mul_ss)(osc_out2R, char_a1),
SIMD_MM(add_ss)(SIMD_MM(mul_ss)(osc_outR, char_b0), SIMD_MM(mul_ss)(LastOscOutR, char_b1)));

_mm_store_ss(&outputR[k], osc_out2R);
SIMD_MM(store_ss)(&outputR[k], osc_out2R);
}
}

/*
** Store the DC accumulation
*/
_mm_store_ss(&dc, mdc);
SIMD_MM(store_ss)(&dc, mdc);

/*
** And clean up and advance our buffer pointer
Expand All @@ -819,24 +819,24 @@ void ClassicOscillator::process_block(float pitch0, float drift, bool stereo, bo
*/
if (bufpos == 0) // only needed if the new bufpos == 0
{
__m128 overlap[FIRipol_N >> 2], dcoverlap[FIRipol_N >> 2], overlapR[FIRipol_N >> 2];
const __m128 zero = _mm_setzero_ps();
SIMD_M128 overlap[FIRipol_N >> 2], dcoverlap[FIRipol_N >> 2], overlapR[FIRipol_N >> 2];
const auto zero = SIMD_MM(setzero_ps)();

for (k = 0; k < (FIRipol_N); k += 4)
{
overlap[k >> 2] = _mm_load_ps(&oscbuffer[OB_LENGTH + k]);
_mm_store_ps(&oscbuffer[k], overlap[k >> 2]);
_mm_store_ps(&oscbuffer[OB_LENGTH + k], zero);
overlap[k >> 2] = SIMD_MM(load_ps)(&oscbuffer[OB_LENGTH + k]);
SIMD_MM(store_ps)(&oscbuffer[k], overlap[k >> 2]);
SIMD_MM(store_ps)(&oscbuffer[OB_LENGTH + k], zero);

dcoverlap[k >> 2] = _mm_load_ps(&dcbuffer[OB_LENGTH + k]);
_mm_store_ps(&dcbuffer[k], dcoverlap[k >> 2]);
_mm_store_ps(&dcbuffer[OB_LENGTH + k], zero);
dcoverlap[k >> 2] = SIMD_MM(load_ps)(&dcbuffer[OB_LENGTH + k]);
SIMD_MM(store_ps)(&dcbuffer[k], dcoverlap[k >> 2]);
SIMD_MM(store_ps)(&dcbuffer[OB_LENGTH + k], zero);

if (stereo)
{
overlapR[k >> 2] = _mm_load_ps(&oscbufferR[OB_LENGTH + k]);
_mm_store_ps(&oscbufferR[k], overlapR[k >> 2]);
_mm_store_ps(&oscbufferR[OB_LENGTH + k], zero);
overlapR[k >> 2] = SIMD_MM(load_ps)(&oscbufferR[OB_LENGTH + k]);
SIMD_MM(store_ps)(&oscbufferR[k], overlapR[k >> 2]);
SIMD_MM(store_ps)(&oscbufferR[OB_LENGTH + k], zero);
}
}
}
Expand Down
19 changes: 18 additions & 1 deletion src/common/globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,12 @@
static_assert(__cplusplus == 201703L, "Surge requires C++17; please update your build");

#if (defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \
(defined(_M_IX86_FP) && _M_IX86_FP >= 2)) //&& !defined(_M_ARM64EC)
(defined(_M_IX86_FP) && _M_IX86_FP >= 2)) || defined(_M_ARM64EC)
#include <emmintrin.h>

#include <cmath>
#include "simde/x86/sse2.h"

#else
// With the upgrade to simde 0.8.2 and subsequent conversations
// with simde maintainers, this include should work for every
Expand All @@ -44,10 +48,23 @@ static_assert(__cplusplus == 201703L, "Surge requires C++17; please update your
//
// and just always include this in the else side
#include <cmath>
#ifndef SIMDE_SKIP_ENABLE_NATIVE_ALIASES
#define SIMDE_ENABLE_NATIVE_ALIASES
#endif

#include "simde/x86/sse2.h"
#endif

// Pick the spelling used for SSE intrinsics (SIMD_MM) and for the 128-bit
// float vector type (SIMD_M128):
//
//   genuine x86/x64 SSE2 : SIMD_MM(add_ps) -> _mm_add_ps,      SIMD_M128 -> __m128
//   everything else      : SIMD_MM(add_ps) -> simde_mm_add_ps, SIMD_M128 -> simde__m128
//
// ARM64EC is meant to land in the "everything else" branch. MSVC predefines
// _M_X64 and _M_AMD64 for ARM64EC compilations as well, so simply leaving
// _M_ARM64EC out of the list is not enough; it has to be excluded explicitly.
#if (defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) ||                                  \
     (defined(_M_IX86_FP) && _M_IX86_FP >= 2)) &&                                                  \
    !defined(_M_ARM64EC)
#define SIMD_MM(x) _mm_##x
#define SIMD_M128 __m128
#else
#define SIMD_MM(x) simde_mm_##x
#define SIMD_M128 simde__m128
#endif


#if MAC || LINUX
#include <strings.h>

Expand Down

0 comments on commit 210bfaa

Please sign in to comment.