From 051a84b78ca35ae868954324c6a16f08e485f272 Mon Sep 17 00:00:00 2001
From: Ananthakrishna Gopal
 <50374726+ananthakrishnagopal@users.noreply.github.com>
Date: Sat, 3 May 2025 00:26:28 +0530
Subject: [PATCH 01/11] Update _util.py

Added Vectorized and Numba JIT Accelerated SFWHT and Gray Code Permutations.
---
 py/fable/_util.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/py/fable/_util.py b/py/fable/_util.py
index de39210..65033a0 100644
--- a/py/fable/_util.py
+++ b/py/fable/_util.py
@@ -30,6 +30,11 @@ def gray_permutation(a):
         b[i] = a[gray_code(i)]
     return b
 
+def gray_permutation_vectorized(a):
+    """Fast Gray code permutation using NumPy vectorization."""
+    indices = np.arange(a.shape[0])
+    return a[indices ^ (indices >> 1)]
+
 
 def sfwht(a):
     '''Scaled Fast Walsh-Hadamard transform of input vector a.
@@ -51,7 +56,57 @@ def sfwht(a):
                 a[j + 2**h] = (x - y) / 2
     return a
 
+from numba import njit
+
+@njit
+def sfwht_numba(a):
+     """
+    Numba Accelerated JIT SFWHT, can beat the vectorized version by almost 4x, and naive version by 400x
+    
+    Args:
+        a (np.ndarray): Input vector of size 2^n.
+
+    Returns:
+        np.ndarray: Scaled Walsh-Hadamard transform of `a`.
+    """
+    n = int(np.log2(a.shape[0]))
+    N = a.shape[0]
+    for h in range(n):
+        mh = 1 << (h + 1)
+        m = mh // 2
+        for i in range(0, N, mh):
+            for j in range(i, i + m):
+                x = a[j]
+                y = a[j + m]
+                a[j] = (x + y) / 2
+                a[j + m] = (x - y) / 2
+    return a
+
+def sfwht_optimized_vectorized(a):
+    """
+    Fully vectorized SFWHT using 2D reshaping to eliminate Python loops. Almost 50-100x faster than naive version
 
+    Args:
+        a (np.ndarray): Input vector of size 2^n.
+
+    Returns:
+        np.ndarray: Scaled Walsh-Hadamard transform of `a`.
+    """
+    n = int(np.log2(a.shape[0]))
+    N = a.shape[0]
+    a = a.copy().astype(np.float64)
+
+    for h in range(n):
+        mh = 1 << (h + 1)
+        m = mh // 2
+        a = a.reshape(-1, mh)
+        a[:, :m] = a[:, :m] + a[:, m:]
+        a[:, m:] = a[:, :m] - 2 * a[:, m:]
+        a = a.flatten()
+
+    a /= N
+    return a
+    
 def compute_control(i, n):
     '''Compute the control qubit index based on the index i and size n.'''
     if i == 4**n:

From 1cb5fabd06a046ee5ed2de3448d5ca07e878989c Mon Sep 17 00:00:00 2001
From: Ananthakrishna Gopal
 <50374726+ananthakrishnagopal@users.noreply.github.com>
Date: Sat, 3 May 2025 00:30:54 +0530
Subject: [PATCH 02/11] Update fable.py

added the vectorized gray permutation and sfwht numba
---
 py/fable/fable.py | 94 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 93 insertions(+), 1 deletion(-)

diff --git a/py/fable/fable.py b/py/fable/fable.py
index 80b3936..f581ebe 100644
--- a/py/fable/fable.py
+++ b/py/fable/fable.py
@@ -2,7 +2,99 @@
 # -*- coding: utf-8 -*-
 import numpy as np
 from qiskit import QuantumCircuit
-from ._util import compressed_uniform_rotation, sfwht, gray_permutation
+from ._util import compressed_uniform_rotation, sfwht, gray_permutation,sfwht_numba,gray_permutation_vectorized
+
+
+def faster_fable(a, epsilon=None):
+    '''FABLE - Fast Approximate BLock Encodings.
+
+    Args:
+        a: array
+            matrix to be block encoded.
+        epsilon: float >= 0
+            (optional) compression threshold.
+    Returns:
+        circuit: qiskit circuit
+            circuit that block encodes A
+        alpha: float
+            subnormalization factor
+    '''
+    epsm = np.finfo(a.dtype).eps
+    alpha = np.linalg.norm(np.ravel(a), np.inf)
+    if alpha > 1:
+        alpha = alpha + np.sqrt(epsm)
+        a = a/alpha
+    else:
+        alpha = 1.0
+
+    n, m = a.shape
+    if n != m:
+        k = max(n, m)
+        a = np.pad(a, ((0, k - n), (0, k - m)))
+        n = k
+    logn = int(np.ceil(np.log2(n)))
+    if n < 2**logn:
+        a = np.pad(a, ((0, 2**logn - n), (0, 2**logn - n)))
+        n = 2**logn
+
+    a = np.ravel(a)
+
+    if all(np.abs(np.imag(a)) < epsm):  # real data
+        a = gray_permutation_vectorized(
+                sfwht_numba(
+                    2.0 * np.arccos(np.real(a))
+                )
+            )
+        # threshold the vector
+        if epsilon:
+            a[abs(a) <= epsilon] = 0
+        # compute circuit
+        OA = compressed_uniform_rotation(a)
+    else:  # complex data
+        # magnitude
+        a_m = gray_permutation_vectorized(
+                sfwht_numba(
+                    2.0 * np.arccos(np.abs(a))
+                )
+            )
+        if epsilon:
+            a_m[abs(a_m) <= epsilon] = 0
+
+        # phase
+        a_p = gray_permutation_vectorized(
+                sfwht_numba(
+                    -2.0 * np.angle(a)
+                )
+            )
+        if epsilon:
+            a_p[abs(a_p) <= epsilon] = 0
+
+        # compute circuit
+        OA = compressed_uniform_rotation(a_m).compose(
+                compressed_uniform_rotation(a_p, ry=False)
+            )
+
+    circ = QuantumCircuit(2*logn + 1)
+
+    # diffusion on row indices
+    for i in range(logn):
+        circ.h(i+1)
+
+    # matrix oracle
+    circ = circ.compose(OA)
+
+    # swap register
+    for i in range(logn):
+        circ.swap(i+1,  i+logn+1)
+
+    # diffusion on row indices
+    for i in range(logn):
+        circ.h(i+1)
+
+    # reverse bits because of little-endiannes
+    circ = circ.reverse_bits()
+
+    return circ, alpha
 
 
 def fable(a, epsilon=None):

From 4cfe4e994b121a9d7ff19054ed2ea323ab9ee8db Mon Sep 17 00:00:00 2001
From: Ananthakrishna Gopal
 <50374726+ananthakrishnagopal@users.noreply.github.com>
Date: Sat, 3 May 2025 00:35:58 +0530
Subject: [PATCH 03/11] Update _util.py

removed a leftover copy
---
 py/fable/_util.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/py/fable/_util.py b/py/fable/_util.py
index 65033a0..29dc111 100644
--- a/py/fable/_util.py
+++ b/py/fable/_util.py
@@ -94,7 +94,6 @@ def sfwht_optimized_vectorized(a):
     """
     n = int(np.log2(a.shape[0]))
     N = a.shape[0]
-    a = a.copy().astype(np.float64)
 
     for h in range(n):
         mh = 1 << (h + 1)

From f63cf533a8081352ceb74bd8716a4f29aaa357ce Mon Sep 17 00:00:00 2001
From: Ananthakrishna Gopal
 <50374726+ananthakrishnagopal@users.noreply.github.com>
Date: Sat, 3 May 2025 00:48:12 +0530
Subject: [PATCH 04/11] Update README.md

---
 README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.md b/README.md
index f9a3cdb..deac563 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,10 @@
+
+# Performance Upgrades To FABLE
+- Vectorized/Numba JITified Walsh Hadamard Transforms -> 100 - 400x speedups
+- Vectorized Gray Code Permutations -> upto 20-30x speedups
+
+
+
 # Fast Approximate BLock Encodings (FABLE)
 
 FABLE can synthesize quantum circuits for approximate block-encodings of matrices. A block-encoding is the embedding of a matrix in the leading block of of a larger unitary matrix.

From 050add3440bf69e437336a24d4bc76a37edd4c34 Mon Sep 17 00:00:00 2001
From: Ananthakrishna Gopal
 <50374726+ananthakrishnagopal@users.noreply.github.com>
Date: Sat, 3 May 2025 13:57:35 +0530
Subject: [PATCH 05/11] Update setup.cfg

added numba to pre-requisites.
---
 setup.cfg | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index 10ae32a..b399058 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -23,6 +23,7 @@ python_requires = >=3.6
 install_requires =
    qiskit>=0.19.1
    numpy>=1.20.3
+   numba
 
 [options.packages.find]
-where = py
\ No newline at end of file
+where = py

From 4b1c13ed2c8d99936a4644e8aa22deb26038b401 Mon Sep 17 00:00:00 2001
From: Ananthakrishna Gopal
 <50374726+ananthakrishnagopal@users.noreply.github.com>
Date: Sat, 3 May 2025 14:01:52 +0530
Subject: [PATCH 06/11] Update __init__.py

made faster_fable visible.
---
 py/fable/__init__.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/py/fable/__init__.py b/py/fable/__init__.py
index f9cf7cc..4a9049e 100644
--- a/py/fable/__init__.py
+++ b/py/fable/__init__.py
@@ -1,13 +1,15 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 """Fable"""
-from .fable import fable
+from .fable import fable,faster_fable
 
 __all__ = [
-    'fable'
+    'fable',
+    'faster_fable'
 ]
 
 
+
 __version__ = '1.0.1'
 __author__ = '''Daan Camps'''
 __maintainer__ = 'Daan Camps'

From 328019307a58ef64541bbb7a63e2de51a63e9c63 Mon Sep 17 00:00:00 2001
From: Ananthakrishna Gopal
 <50374726+ananthakrishnagopal@users.noreply.github.com>
Date: Sat, 3 May 2025 14:03:29 +0530
Subject: [PATCH 07/11] Update _util.py

fixed indentation.
---
 py/fable/_util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/py/fable/_util.py b/py/fable/_util.py
index 29dc111..c8fd1ef 100644
--- a/py/fable/_util.py
+++ b/py/fable/_util.py
@@ -60,7 +60,7 @@ def sfwht(a):
 
 @njit
 def sfwht_numba(a):
-     """
+    """
     Numba Accelerated JIT SFWHT, can beat the vectorized version by almost 4x, and naive version by 400x
     
     Args:

From 9486d72f3371f25155e6eb4dec4dfdbaaae198cc Mon Sep 17 00:00:00 2001
From: Ananthakrishna Gopal
 <50374726+ananthakrishnagopal@users.noreply.github.com>
Date: Sat, 3 May 2025 20:22:40 +0530
Subject: [PATCH 08/11] Update fable.py

created a lookup table for compute controls.
---
 py/fable/fable.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/py/fable/fable.py b/py/fable/fable.py
index f581ebe..e3ca837 100644
--- a/py/fable/fable.py
+++ b/py/fable/fable.py
@@ -3,11 +3,12 @@
 import numpy as np
 from qiskit import QuantumCircuit
 from ._util import compressed_uniform_rotation, sfwht, gray_permutation,sfwht_numba,gray_permutation_vectorized
+from ._util import compressed_uniform_rotation_with_lut
 
 
 def faster_fable(a, epsilon=None):
     '''FABLE - Fast Approximate BLock Encodings.
-
+compressed_uniform_rotation_with_lut
     Args:
         a: array
             matrix to be block encoded.
@@ -20,7 +21,8 @@ def faster_fable(a, epsilon=None):
             subnormalization factor
     '''
     epsm = np.finfo(a.dtype).eps
-    alpha = np.linalg.norm(np.ravel(a), np.inf)
+    # alpha = np.linalg.norm(np.ravel(a), np.inf)
+    alpha = 1.0
     if alpha > 1:
         alpha = alpha + np.sqrt(epsm)
         a = a/alpha
@@ -49,7 +51,7 @@ def faster_fable(a, epsilon=None):
         if epsilon:
             a[abs(a) <= epsilon] = 0
         # compute circuit
-        OA = compressed_uniform_rotation(a)
+        OA = compressed_uniform_rotation_with_lut(a)
     else:  # complex data
         # magnitude
         a_m = gray_permutation_vectorized(
@@ -70,8 +72,8 @@ def faster_fable(a, epsilon=None):
             a_p[abs(a_p) <= epsilon] = 0
 
         # compute circuit
-        OA = compressed_uniform_rotation(a_m).compose(
-                compressed_uniform_rotation(a_p, ry=False)
+        OA = compressed_uniform_rotation_with_lut(a_m).compose(
+                compressed_uniform_rotation_with_lut(a_p, ry=False)
             )
 
     circ = QuantumCircuit(2*logn + 1)
@@ -112,7 +114,8 @@ def fable(a, epsilon=None):
             subnormalization factor
     '''
     epsm = np.finfo(a.dtype).eps
-    alpha = np.linalg.norm(np.ravel(a), np.inf)
+    # alpha = np.linalg.norm(np.ravel(a), np.inf)
+    alpha = 1.0
     if alpha > 1:
         alpha = alpha + np.sqrt(epsm)
         a = a/alpha

From dab6af096c99623a01c6ad17c2cb194842d721cd Mon Sep 17 00:00:00 2001
From: Ananthakrishna Gopal
 <50374726+ananthakrishnagopal@users.noreply.github.com>
Date: Sat, 3 May 2025 20:23:10 +0530
Subject: [PATCH 09/11] Update _util.py

added lookup table and optimized logic.
---
 py/fable/_util.py | 80 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/py/fable/_util.py b/py/fable/_util.py
index c8fd1ef..f74bacc 100644
--- a/py/fable/_util.py
+++ b/py/fable/_util.py
@@ -155,3 +155,83 @@ def compressed_uniform_rotation(a, ry=True):
                 circ.cx(j, 0)
 
     return circ
+
+
+def count_trailing_ones_optimized(x):
+    if x == 0:
+        return 0
+    return ((x ^ (x + 1)) >> 1).bit_length()
+
+def compute_control_optimized(i, n, max_i):
+    if i == max_i:
+        return 1
+    trailing_ones = count_trailing_ones_optimized(i - 1)
+    return 2*n - trailing_ones
+
+def precompute_control_lut(n):
+    max_trailing_ones = 2*n
+    return [2*n - t for t in range(max_trailing_ones + 1)]
+
+from functools import lru_cache
+
+# @lru_cache(maxsize=None)
+def compressed_uniform_rotation_with_lut(a, ry=True):
+    """
+    Optimized compressed uniform rotation using a control LUT.
+
+    Args:
+        a (np.ndarray): Thresholded vector of dimension 2^(2n)
+        ry (bool): Apply RY if True, RZ otherwise
+
+    Returns:
+        QuantumCircuit: Qiskit circuit representing the rotation.
+    """
+    from qiskit import QuantumCircuit
+
+    n = int(np.log2(a.shape[0]) // 2)
+    max_i = a.shape[0]
+    circ = QuantumCircuit(2 * n + 1)
+
+    # Precompute control LUT (size O(n))
+    max_trailing_ones = 2 * n
+    control_lut = precompute_control_lut(n)
+
+    i = 0
+    while i < max_i:
+        parity_check = 0
+
+        # Apply rotation
+        if a[i] != 0:
+            if ry:
+                circ.ry(a[i], 0)
+            else:
+                circ.rz(a[i], 0)
+
+        # Skip zero blocks
+        while True:
+            # Special case: i+1 == max_i
+            if i + 1 == max_i:
+                ctrl = 1
+            else:
+                # Compute trailing_ones(i) and use LUT
+                trailing_ones = count_trailing_ones_optimized(i)
+                ctrl = control_lut[trailing_ones]
+
+            # Toggle control bit
+            parity_check ^= (1 << (ctrl - 1))
+            i += 1
+
+            # Break if non-zero or end of array
+            if i >= max_i or a[i] != 0:
+                break
+
+        # Add CNOT gates (only for set bits)
+        bit = 1
+        pos = 0
+        while bit <= parity_check:
+            if parity_check & bit:
+                circ.cx(pos + 1, 0)
+            bit <<= 1
+            pos += 1
+
+    return circ

From 59c91a35e16ee7b5b71ff9c10cd9e548f474e26c Mon Sep 17 00:00:00 2001
From: Ananthakrishna Gopal
 <50374726+ananthakrishnagopal@users.noreply.github.com>
Date: Sat, 3 May 2025 20:23:41 +0530
Subject: [PATCH 10/11] Update fable.py

---
 py/fable/fable.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/py/fable/fable.py b/py/fable/fable.py
index e3ca837..9f5ba1c 100644
--- a/py/fable/fable.py
+++ b/py/fable/fable.py
@@ -4,7 +4,7 @@
 from qiskit import QuantumCircuit
 from ._util import compressed_uniform_rotation, sfwht, gray_permutation,sfwht_numba,gray_permutation_vectorized
 from ._util import compressed_uniform_rotation_with_lut
-
+ 
 
 def faster_fable(a, epsilon=None):
     '''FABLE - Fast Approximate BLock Encodings.
@@ -21,8 +21,8 @@ def faster_fable(a, epsilon=None):
             subnormalization factor
     '''
     epsm = np.finfo(a.dtype).eps
-    # alpha = np.linalg.norm(np.ravel(a), np.inf)
-    alpha = 1.0
+    alpha = np.linalg.norm(np.ravel(a), np.inf)
+    # alpha = 1.0
     if alpha > 1:
         alpha = alpha + np.sqrt(epsm)
         a = a/alpha

From 62753a9b0c3977f92e178fac68b0a78f91e76836 Mon Sep 17 00:00:00 2001
From: Ananthakrishna Gopal
 <50374726+ananthakrishnagopal@users.noreply.github.com>
Date: Sat, 3 May 2025 20:24:40 +0530
Subject: [PATCH 11/11] Update README.md

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index deac563..4a0065a 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,9 @@
 # Performance Upgrades To FABLE
 - Vectorized/Numba JITified Walsh Hadamard Transforms -> 100 - 400x speedups
 - Vectorized Gray Code Permutations -> upto 20-30x speedups
+- Lookup tables for computing control qubits -> 50x speedups
+
+- Overall atleast 30% speedup on base version.