diff --git a/README.md b/README.md index 1a8ec49..5241f91 100644 --- a/README.md +++ b/README.md @@ -275,16 +275,16 @@ CPU Caches: ------------------------------------------------------------------------------- Benchmark Time CPU Iterations ------------------------------------------------------------------------------- -BM_FFT_DSPLIB/1024/min_time:5.000 4.61 us 4.61 us 1503885 -BM_FFT_DSPLIB/1331/min_time:5.000 37.5 us 37.5 us 185462 -BM_FFT_DSPLIB/1536/min_time:5.000 12.7 us 12.7 us 533535 -BM_FFT_DSPLIB/1984/min_time:5.000 60.3 us 60.3 us 116035 -BM_FFT_DSPLIB/2048/min_time:5.000 10.4 us 10.4 us 672628 -BM_FFT_DSPLIB/4096/min_time:5.000 23.0 us 23.0 us 303742 -BM_FFT_DSPLIB/8192/min_time:5.000 53.2 us 53.2 us 131683 -BM_FFT_DSPLIB/11200/min_time:5.000 266 us 266 us 26324 -BM_FFT_DSPLIB/11202/min_time:5.000 511 us 511 us 13702 -BM_FFT_DSPLIB/16384/min_time:5.000 113 us 113 us 62225 +BM_FFT_DSPLIB/1024/min_time:5.000 4.32 us 4.32 us 1612493 +BM_FFT_DSPLIB/1331/min_time:5.000 37.6 us 37.6 us 184823 +BM_FFT_DSPLIB/1536/min_time:5.000 12.4 us 12.4 us 542987 +BM_FFT_DSPLIB/1984/min_time:5.000 60.3 us 60.3 us 112019 +BM_FFT_DSPLIB/2048/min_time:5.000 9.91 us 9.91 us 702093 +BM_FFT_DSPLIB/4096/min_time:5.000 22.1 us 22.1 us 318985 +BM_FFT_DSPLIB/8192/min_time:5.000 46.5 us 46.5 us 150514 +BM_FFT_DSPLIB/11200/min_time:5.000 264 us 264 us 26518 +BM_FFT_DSPLIB/11202/min_time:5.000 481 us 481 us 14543 +BM_FFT_DSPLIB/16384/min_time:5.000 100 us 100 us 69869 BM_FFTW3_DOUBLE/1024/min_time:5.000 1.03 us 1.03 us 6563943 BM_FFTW3_DOUBLE/1331/min_time:5.000 4.20 us 4.20 us 1673972 diff --git a/lib/fft/pow2-fft.cpp b/lib/fft/pow2-fft.cpp index 56b4e64..7d0a8ac 100644 --- a/lib/fft/pow2-fft.cpp +++ b/lib/fft/pow2-fft.cpp @@ -30,28 +30,40 @@ std::vector _gen_bitrev_table(int n) noexcept { return res; } -//generate exp(-1i * 2 * pi * t / n) table +//generate modified exp(-1i * 2 * pi * t / n) table +//{0, 0, n/2, 0, n/4, n/2, 3*n/4, 0, n/8, n/4 ...} std::vector _gen_coeffs_table(int n) noexcept { DSPLIB_ASSUME(n % 4 == 0); const int n4 = n / 4; const int n2 = n / 2; - const int n3 = 3 * n / 4; - std::vector res(n); - res[0] = {1, 0}; - res[n4] = {0, -1}; - res[n2] = {-1, 0}; - res[n3] = {0, 1}; - //use only first n/4 samples + std::vector tb(n / 2); + + //calculate only first n/4 samples + tb[0] = {1, 0}; + tb[n4] = {0, -1}; for (int i = 1; i < n4; ++i) { const auto v = std::cos(2 * pi * i / n); - res[i].re = v; - res[n - i].re = v; - res[n2 + i].re = -v; - res[n2 - i].re = -v; - res[n4 + i].im = -v; - res[n4 - i].im = -v; - res[n3 + i].im = v; - res[n3 - i].im = v; + tb[i].re = v; + tb[n2 - i].re = -v; + tb[n4 + i].im = -v; + tb[n4 - i].im = -v; + } + + //ensure consistent access to coefficients + //{0}, {0, n/2}, {0, n/4, 2*n/4, 3*n/4} ... + //todo: remove first {1, 0} element + uint32_t h = 1; + uint32_t m = n / 2; + uint32_t r = 0; + const uint32_t s = std::log2(n); + std::vector res(n); + for (uint32_t i = 0; i < s; ++i) { + for (int k = 0; k < h; ++k) { + res[r + k] = tb[k * m]; + } + r += h; + h *= 2; + m /= 2; } return res; } @@ -93,43 +105,89 @@ int Pow2FftPlan::size() const noexcept { return n_; } -//TODO: add "small" implementations (2, 4, 8) void Pow2FftPlan::_fft(const cmplx_t* restrict in, cmplx_t* restrict out, int n) const noexcept { DSPLIB_ASSUME(n % 2 == 0); - DSPLIB_ASSUME(n >= 2); + DSPLIB_ASSUME(n >= 4); DSPLIB_ASSUME(n == (1L << l_)); //reverse sampling _bitreverse(in, out, bitrev_.data(), n); - uint32_t h = 1; ///< number of butterflies in clusters (and step between elements) - uint32_t m = n / 2; ///< number of clusters (and step for the butterfly table) - uint32_t r = 2; ///< number of elements (butterflies * 2) in clusters - - const cmplx_t* restrict cf = coeffs_.data(); - for (int i = 0; i < l_; ++i) { - cmplx_t* px1 = out; - cmplx_t* px2 = out + h; - DSPLIB_ASSUME(px1 != px2); - for (uint32_t j = 0; j < m; ++j) { - for (uint32_t k = 0; k < h; ++k) { - const cmplx_t x1 = px1[k]; - const cmplx_t x2 = px2[k]; - const cmplx_t p = cf[k * m] * x2; - px2[k].re = x1.re - p.re; - px2[k].im = x1.im - p.im; - px1[k].re = x1.re + p.re; - px1[k].im = x1.im + p.im; + //i=0 + { + const uint32_t h = 1; + const uint32_t r = 2; + const uint32_t m = n / 2; + for (int j = 0; j < m; ++j) { + cmplx_t* px1 = out + (j * r); + cmplx_t* px2 = px1 + h; + const cmplx_t x1 = px1[0]; + const cmplx_t w = px2[0]; + px1[0].re = x1.re + w.re; + px1[0].im = x1.im + w.im; + px2[0].re = x1.re - w.re; + px2[0].im = x1.im - w.im; + } + } + + //i=1 + { + const uint32_t h = 2; + const uint32_t r = 4; + const uint32_t m = n / 4; + for (int j = 0; j < m; ++j) { + cmplx_t* px1 = out + (j * r); + cmplx_t* px2 = px1 + h; + //k=0, c={1,0} + { + const cmplx_t x1 = px1[0]; + const cmplx_t w = px2[0]; + px1[0].re = x1.re + w.re; + px1[0].im = x1.im + w.im; + px2[0].re = x1.re - w.re; + px2[0].im = x1.im - w.im; + } + //k=1, c={0,-1} + { + const cmplx_t x1 = px1[1]; + const cmplx_t w = {px2[1].im, -px2[1].re}; + px1[1].re = x1.re + w.re; + px1[1].im = x1.im + w.im; + px2[1].re = x1.re - w.re; + px2[1].im = x1.im - w.im; } - DSPLIB_ASSUME(r % 2 == 0); - px1 += r; - px2 += r; } + } - //next cascade - m /= 2; - h *= 2; - r *= 2; + for (int i = 2; i < l_; ++i) { + const uint32_t h = uint32_t(1) << i; + const uint32_t r = uint32_t(1) << (i + 1); + const uint32_t m = n >> (i + 1); + const cmplx_t* pw = coeffs_.data() + h - 1; + for (int j = 0; j < m; ++j) { + cmplx_t* px1 = out + (j * r); + cmplx_t* px2 = px1 + h; + + //k=0, c={1,0} + { + const cmplx_t x1 = px1[0]; + const cmplx_t w = px2[0]; + px1[0].re = x1.re + w.re; + px1[0].im = x1.im + w.im; + px2[0].re = x1.re - w.re; + px2[0].im = x1.im - w.im; + } + + //k=[1:h] + for (int k = 1; k < h; ++k) { + const cmplx_t x1 = px1[k]; + const cmplx_t w = pw[k] * px2[k]; + px1[k].re = x1.re + w.re; + px1[k].im = x1.im + w.im; + px2[k].re = x1.re - w.re; + px2[k].im = x1.im - w.im; + } + } } }