diff --git a/README.md b/README.md
index 1a8ec49..5241f91 100644
--- a/README.md
+++ b/README.md
@@ -275,16 +275,16 @@ CPU Caches:
 -------------------------------------------------------------------------------
 Benchmark                                     Time             CPU   Iterations
 -------------------------------------------------------------------------------
-BM_FFT_DSPLIB/1024/min_time:5.000          4.61 us         4.61 us      1503885
-BM_FFT_DSPLIB/1331/min_time:5.000          37.5 us         37.5 us       185462
-BM_FFT_DSPLIB/1536/min_time:5.000          12.7 us         12.7 us       533535
-BM_FFT_DSPLIB/1984/min_time:5.000          60.3 us         60.3 us       116035
-BM_FFT_DSPLIB/2048/min_time:5.000          10.4 us         10.4 us       672628
-BM_FFT_DSPLIB/4096/min_time:5.000          23.0 us         23.0 us       303742
-BM_FFT_DSPLIB/8192/min_time:5.000          53.2 us         53.2 us       131683
-BM_FFT_DSPLIB/11200/min_time:5.000          266 us          266 us        26324
-BM_FFT_DSPLIB/11202/min_time:5.000          511 us          511 us        13702
-BM_FFT_DSPLIB/16384/min_time:5.000          113 us          113 us        62225
+BM_FFT_DSPLIB/1024/min_time:5.000             4.32 us         4.32 us      1612493
+BM_FFT_DSPLIB/1331/min_time:5.000             37.6 us         37.6 us       184823
+BM_FFT_DSPLIB/1536/min_time:5.000             12.4 us         12.4 us       542987
+BM_FFT_DSPLIB/1984/min_time:5.000             60.3 us         60.3 us       112019
+BM_FFT_DSPLIB/2048/min_time:5.000             9.91 us         9.91 us       702093
+BM_FFT_DSPLIB/4096/min_time:5.000             22.1 us         22.1 us       318985
+BM_FFT_DSPLIB/8192/min_time:5.000             46.5 us         46.5 us       150514
+BM_FFT_DSPLIB/11200/min_time:5.000             264 us          264 us        26518
+BM_FFT_DSPLIB/11202/min_time:5.000             481 us          481 us        14543
+BM_FFT_DSPLIB/16384/min_time:5.000             100 us          100 us        69869
 
 BM_FFTW3_DOUBLE/1024/min_time:5.000        1.03 us         1.03 us      6563943
 BM_FFTW3_DOUBLE/1331/min_time:5.000        4.20 us         4.20 us      1673972
diff --git a/lib/fft/pow2-fft.cpp b/lib/fft/pow2-fft.cpp
index 56b4e64..7d0a8ac 100644
--- a/lib/fft/pow2-fft.cpp
+++ b/lib/fft/pow2-fft.cpp
@@ -30,28 +30,40 @@ std::vector<int32_t> _gen_bitrev_table(int n) noexcept {
     return res;
 }
 
-//generate exp(-1i * 2 * pi * t / n) table
+//generate modified exp(-1i * 2 * pi * t / n) table
+//{0, 0, n/2, 0, n/4, n/2, 3*n/4, 0, n/8, n/4 ...}
 std::vector<cmplx_t> _gen_coeffs_table(int n) noexcept {
     DSPLIB_ASSUME(n % 4 == 0);
     const int n4 = n / 4;
     const int n2 = n / 2;
-    const int n3 = 3 * n / 4;
-    std::vector<cmplx_t> res(n);
-    res[0] = {1, 0};
-    res[n4] = {0, -1};
-    res[n2] = {-1, 0};
-    res[n3] = {0, 1};
-    //use only first n/4 samples
+    std::vector<cmplx_t> tb(n / 2);
+
+    //calculate only first n/4 samples
+    tb[0] = {1, 0};
+    tb[n4] = {0, -1};
     for (int i = 1; i < n4; ++i) {
         const auto v = std::cos(2 * pi * i / n);
-        res[i].re = v;
-        res[n - i].re = v;
-        res[n2 + i].re = -v;
-        res[n2 - i].re = -v;
-        res[n4 + i].im = -v;
-        res[n4 - i].im = -v;
-        res[n3 + i].im = v;
-        res[n3 - i].im = v;
+        tb[i].re = v;
+        tb[n2 - i].re = -v;
+        tb[n4 + i].im = -v;
+        tb[n4 - i].im = -v;
+    }
+
+    //ensure consistent access to coefficients
+    //{0}, {0, n/2}, {0, n/4, 2*n/4, 3*n/4} ...
+    //todo: remove first {1, 0} element
+    uint32_t h = 1;
+    uint32_t m = n / 2;
+    uint32_t r = 0;
+    const uint32_t s = std::log2(n);
+    std::vector<cmplx_t> res(n);
+    for (uint32_t i = 0; i < s; ++i) {
+        for (int k = 0; k < h; ++k) {
+            res[r + k] = tb[k * m];
+        }
+        r += h;
+        h *= 2;
+        m /= 2;
     }
     return res;
 }
@@ -93,43 +105,89 @@ int Pow2FftPlan::size() const noexcept {
     return n_;
 }
 
-//TODO: add "small" implementations (2, 4, 8)
 void Pow2FftPlan::_fft(const cmplx_t* restrict in, cmplx_t* restrict out, int n) const noexcept {
     DSPLIB_ASSUME(n % 2 == 0);
-    DSPLIB_ASSUME(n >= 2);
+    DSPLIB_ASSUME(n >= 4);
     DSPLIB_ASSUME(n == (1L << l_));
 
     //reverse sampling
     _bitreverse(in, out, bitrev_.data(), n);
 
-    uint32_t h = 1;       ///< number of butterflies in clusters (and step between elements)
-    uint32_t m = n / 2;   ///< number of clusters (and step for the butterfly table)
-    uint32_t r = 2;       ///< number of elements (butterflies * 2) in clusters
-
-    const cmplx_t* restrict cf = coeffs_.data();
-    for (int i = 0; i < l_; ++i) {
-        cmplx_t* px1 = out;
-        cmplx_t* px2 = out + h;
-        DSPLIB_ASSUME(px1 != px2);
-        for (uint32_t j = 0; j < m; ++j) {
-            for (uint32_t k = 0; k < h; ++k) {
-                const cmplx_t x1 = px1[k];
-                const cmplx_t x2 = px2[k];
-                const cmplx_t p = cf[k * m] * x2;
-                px2[k].re = x1.re - p.re;
-                px2[k].im = x1.im - p.im;
-                px1[k].re = x1.re + p.re;
-                px1[k].im = x1.im + p.im;
+    //i=0
+    {
+        const uint32_t h = 1;
+        const uint32_t r = 2;
+        const uint32_t m = n / 2;
+        for (int j = 0; j < m; ++j) {
+            cmplx_t* px1 = out + (j * r);
+            cmplx_t* px2 = px1 + h;
+            const cmplx_t x1 = px1[0];
+            const cmplx_t w = px2[0];
+            px1[0].re = x1.re + w.re;
+            px1[0].im = x1.im + w.im;
+            px2[0].re = x1.re - w.re;
+            px2[0].im = x1.im - w.im;
+        }
+    }
+
+    //i=1
+    {
+        const uint32_t h = 2;
+        const uint32_t r = 4;
+        const uint32_t m = n / 4;
+        for (int j = 0; j < m; ++j) {
+            cmplx_t* px1 = out + (j * r);
+            cmplx_t* px2 = px1 + h;
+            //k=0, c={1,0}
+            {
+                const cmplx_t x1 = px1[0];
+                const cmplx_t w = px2[0];
+                px1[0].re = x1.re + w.re;
+                px1[0].im = x1.im + w.im;
+                px2[0].re = x1.re - w.re;
+                px2[0].im = x1.im - w.im;
+            }
+            //k=1, c={0,-1}
+            {
+                const cmplx_t x1 = px1[1];
+                const cmplx_t w = {px2[1].im, -px2[1].re};
+                px1[1].re = x1.re + w.re;
+                px1[1].im = x1.im + w.im;
+                px2[1].re = x1.re - w.re;
+                px2[1].im = x1.im - w.im;
             }
-            DSPLIB_ASSUME(r % 2 == 0);
-            px1 += r;
-            px2 += r;
         }
+    }
 
-        //next cascade
-        m /= 2;
-        h *= 2;
-        r *= 2;
+    for (int i = 2; i < l_; ++i) {
+        const uint32_t h = uint32_t(1) << i;
+        const uint32_t r = uint32_t(1) << (i + 1);
+        const uint32_t m = n >> (i + 1);
+        const cmplx_t* pw = coeffs_.data() + h - 1;
+        for (int j = 0; j < m; ++j) {
+            cmplx_t* px1 = out + (j * r);
+            cmplx_t* px2 = px1 + h;
+
+            //k=0, c={1,0}
+            {
+                const cmplx_t x1 = px1[0];
+                const cmplx_t w = px2[0];
+                px1[0].re = x1.re + w.re;
+                px1[0].im = x1.im + w.im;
+                px2[0].re = x1.re - w.re;
+                px2[0].im = x1.im - w.im;
+            }
+
+            //k=[1:h]
+            for (int k = 1; k < h; ++k) {
+                const cmplx_t x1 = px1[k];
+                const cmplx_t w = pw[k] * px2[k];
+                px1[k].re = x1.re + w.re;
+                px1[k].im = x1.im + w.im;
+                px2[k].re = x1.re - w.re;
+                px2[k].im = x1.im - w.im;
+            }
+        }
     }
 }