From 051a84b78ca35ae868954324c6a16f08e485f272 Mon Sep 17 00:00:00 2001 From: Ananthakrishna Gopal <50374726+ananthakrishnagopal@users.noreply.github.com> Date: Sat, 3 May 2025 00:26:28 +0530 Subject: [PATCH 01/11] Update _util.py Added Vectorized and Numba JIT Accelerated SFWHT and Gray Code Permutations. --- py/fable/_util.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/py/fable/_util.py b/py/fable/_util.py index de39210..65033a0 100644 --- a/py/fable/_util.py +++ b/py/fable/_util.py @@ -30,6 +30,11 @@ def gray_permutation(a): b[i] = a[gray_code(i)] return b +def gray_permutation_vectorized(a): + """Fast Gray code permutation using NumPy vectorization.""" + indices = np.arange(a.shape[0]) + return a[indices ^ (indices >> 1)] + def sfwht(a): '''Scaled Fast Walsh-Hadamard transform of input vector a. @@ -51,7 +56,57 @@ def sfwht(a): a[j + 2**h] = (x - y) / 2 return a +from numba import njit + +@njit +def sfwht_numba(a): + """ + Numba Accelerated JIT SFWHT, can beat the vectorized version by almost 4x, and naive version by 400x + + Args: + a (np.ndarray): Input vector of size 2^n. + + Returns: + np.ndarray: Scaled Walsh-Hadamard transform of `a`. + """ + n = int(np.log2(a.shape[0])) + N = a.shape[0] + for h in range(n): + mh = 1 << (h + 1) + m = mh // 2 + for i in range(0, N, mh): + for j in range(i, i + m): + x = a[j] + y = a[j + m] + a[j] = (x + y) / 2 + a[j + m] = (x - y) / 2 + return a + +def sfwht_optimized_vectorized(a): + """ + Fully vectorized SFWHT using 2D reshaping to eliminate Python loops. Almost 50-100x faster than naive version + Args: + a (np.ndarray): Input vector of size 2^n. + + Returns: + np.ndarray: Scaled Walsh-Hadamard transform of `a`. + """ + n = int(np.log2(a.shape[0])) + N = a.shape[0] + a = a.copy().astype(np.float64) + + for h in range(n): + mh = 1 << (h + 1) + m = mh // 2 + a = a.reshape(-1, mh) + a[:, :m] = a[:, :m] + a[:, m:] + a[:, m:] = a[:, :m] - 2 * a[:, m:] + a = a.flatten() + + a /= N + return a + def compute_control(i, n): '''Compute the control qubit index based on the index i and size n.''' if i == 4**n: From 1cb5fabd06a046ee5ed2de3448d5ca07e878989c Mon Sep 17 00:00:00 2001 From: Ananthakrishna Gopal <50374726+ananthakrishnagopal@users.noreply.github.com> Date: Sat, 3 May 2025 00:30:54 +0530 Subject: [PATCH 02/11] Update fable.py added the vectorized gray permutation and sfwht numba --- py/fable/fable.py | 94 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) diff --git a/py/fable/fable.py b/py/fable/fable.py index 80b3936..f581ebe 100644 --- a/py/fable/fable.py +++ b/py/fable/fable.py @@ -2,7 +2,99 @@ # -*- coding: utf-8 -*- import numpy as np from qiskit import QuantumCircuit -from ._util import compressed_uniform_rotation, sfwht, gray_permutation +from ._util import compressed_uniform_rotation, sfwht, gray_permutation,sfwht_numba,gray_permutation_vectorized + + +def faster_fable(a, epsilon=None): + '''FABLE - Fast Approximate BLock Encodings. + + Args: + a: array + matrix to be block encoded. + epsilon: float >= 0 + (optional) compression threshold. + Returns: + circuit: qiskit circuit + circuit that block encodes A + alpha: float + subnormalization factor + ''' + epsm = np.finfo(a.dtype).eps + alpha = np.linalg.norm(np.ravel(a), np.inf) + if alpha > 1: + alpha = alpha + np.sqrt(epsm) + a = a/alpha + else: + alpha = 1.0 + + n, m = a.shape + if n != m: + k = max(n, m) + a = np.pad(a, ((0, k - n), (0, k - m))) + n = k + logn = int(np.ceil(np.log2(n))) + if n < 2**logn: + a = np.pad(a, ((0, 2**logn - n), (0, 2**logn - n))) + n = 2**logn + + a = np.ravel(a) + + if all(np.abs(np.imag(a)) < epsm): # real data + a = gray_permutation_vectorized( + sfwht_numba( + 2.0 * np.arccos(np.real(a)) + ) + ) + # threshold the vector + if epsilon: + a[abs(a) <= epsilon] = 0 + # compute circuit + OA = compressed_uniform_rotation(a) + else: # complex data + # magnitude + a_m = gray_permutation_vectorized( + sfwht_numba( + 2.0 * np.arccos(np.abs(a)) + ) + ) + if epsilon: + a_m[abs(a_m) <= epsilon] = 0 + + # phase + a_p = gray_permutation_vectorized( + sfwht_numba( + -2.0 * np.angle(a) + ) + ) + if epsilon: + a_p[abs(a_p) <= epsilon] = 0 + + # compute circuit + OA = compressed_uniform_rotation(a_m).compose( + compressed_uniform_rotation(a_p, ry=False) + ) + + circ = QuantumCircuit(2*logn + 1) + + # diffusion on row indices + for i in range(logn): + circ.h(i+1) + + # matrix oracle + circ = circ.compose(OA) + + # swap register + for i in range(logn): + circ.swap(i+1, i+logn+1) + + # diffusion on row indices + for i in range(logn): + circ.h(i+1) + + # reverse bits because of little-endiannes + circ = circ.reverse_bits() + + return circ, alpha def fable(a, epsilon=None): From 4cfe4e994b121a9d7ff19054ed2ea323ab9ee8db Mon Sep 17 00:00:00 2001 From: Ananthakrishna Gopal <50374726+ananthakrishnagopal@users.noreply.github.com> Date: Sat, 3 May 2025 00:35:58 +0530 Subject: [PATCH 03/11] Update _util.py removed a leftover copy --- py/fable/_util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/py/fable/_util.py b/py/fable/_util.py index 65033a0..29dc111 100644 --- a/py/fable/_util.py +++ b/py/fable/_util.py @@ -94,7 +94,6 @@ def sfwht_optimized_vectorized(a): """ n = int(np.log2(a.shape[0])) N = a.shape[0] - a = a.copy().astype(np.float64) for h in range(n): mh = 1 << (h + 1) From f63cf533a8081352ceb74bd8716a4f29aaa357ce Mon Sep 17 00:00:00 2001 From: Ananthakrishna Gopal <50374726+ananthakrishnagopal@users.noreply.github.com> Date: Sat, 3 May 2025 00:48:12 +0530 Subject: [PATCH 04/11] Update README.md --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index f9a3cdb..deac563 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,10 @@ + +# Performance Upgrades To FABLE +- Vectorized/Numba JITified Walsh Hadamard Transforms -> 100 - 400x speedups +- Vectorized Gray Code Permutations -> upto 20-30x speedups + + + # Fast Approximate BLock Encodings (FABLE) FABLE can synthesize quantum circuits for approximate block-encodings of matrices. A block-encoding is the embedding of a matrix in the leading block of of a larger unitary matrix. From 050add3440bf69e437336a24d4bc76a37edd4c34 Mon Sep 17 00:00:00 2001 From: Ananthakrishna Gopal <50374726+ananthakrishnagopal@users.noreply.github.com> Date: Sat, 3 May 2025 13:57:35 +0530 Subject: [PATCH 05/11] Update setup.cfg added numba to pre-requisites. --- setup.cfg | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 10ae32a..b399058 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,6 +23,7 @@ python_requires = >=3.6 install_requires = qiskit>=0.19.1 numpy>=1.20.3 + numba [options.packages.find] -where = py \ No newline at end of file +where = py From 4b1c13ed2c8d99936a4644e8aa22deb26038b401 Mon Sep 17 00:00:00 2001 From: Ananthakrishna Gopal <50374726+ananthakrishnagopal@users.noreply.github.com> Date: Sat, 3 May 2025 14:01:52 +0530 Subject: [PATCH 06/11] Update __init__.py made faster_fable visible. --- py/fable/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/py/fable/__init__.py b/py/fable/__init__.py index f9cf7cc..4a9049e 100644 --- a/py/fable/__init__.py +++ b/py/fable/__init__.py @@ -1,13 +1,15 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- """Fable""" -from .fable import fable +from .fable import fable,faster_fable __all__ = [ - 'fable' + 'fable', + 'faster_fable' ] + __version__ = '1.0.1' __author__ = '''Daan Camps''' __maintainer__ = 'Daan Camps' From 328019307a58ef64541bbb7a63e2de51a63e9c63 Mon Sep 17 00:00:00 2001 From: Ananthakrishna Gopal <50374726+ananthakrishnagopal@users.noreply.github.com> Date: Sat, 3 May 2025 14:03:29 +0530 Subject: [PATCH 07/11] Update _util.py fixed indentation. --- py/fable/_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py/fable/_util.py b/py/fable/_util.py index 29dc111..c8fd1ef 100644 --- a/py/fable/_util.py +++ b/py/fable/_util.py @@ -60,7 +60,7 @@ def sfwht(a): @njit def sfwht_numba(a): - """ + """ Numba Accelerated JIT SFWHT, can beat the vectorized version by almost 4x, and naive version by 400x Args: From 9486d72f3371f25155e6eb4dec4dfdbaaae198cc Mon Sep 17 00:00:00 2001 From: Ananthakrishna Gopal <50374726+ananthakrishnagopal@users.noreply.github.com> Date: Sat, 3 May 2025 20:22:40 +0530 Subject: [PATCH 08/11] Update fable.py created a lookup table for compute controls. --- py/fable/fable.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/py/fable/fable.py b/py/fable/fable.py index f581ebe..e3ca837 100644 --- a/py/fable/fable.py +++ b/py/fable/fable.py @@ -3,11 +3,12 @@ import numpy as np from qiskit import QuantumCircuit from ._util import compressed_uniform_rotation, sfwht, gray_permutation,sfwht_numba,gray_permutation_vectorized +from ._util import compressed_uniform_rotation_with_lut def faster_fable(a, epsilon=None): '''FABLE - Fast Approximate BLock Encodings. - +compressed_uniform_rotation_with_lut Args: a: array matrix to be block encoded. @@ -20,7 +21,8 @@ def faster_fable(a, epsilon=None): subnormalization factor ''' epsm = np.finfo(a.dtype).eps - alpha = np.linalg.norm(np.ravel(a), np.inf) + # alpha = np.linalg.norm(np.ravel(a), np.inf) + alpha = 1.0 if alpha > 1: alpha = alpha + np.sqrt(epsm) a = a/alpha @@ -49,7 +51,7 @@ def faster_fable(a, epsilon=None): if epsilon: a[abs(a) <= epsilon] = 0 # compute circuit - OA = compressed_uniform_rotation(a) + OA = compressed_uniform_rotation_with_lut(a) else: # complex data # magnitude a_m = gray_permutation_vectorized( @@ -70,8 +72,8 @@ def faster_fable(a, epsilon=None): a_p[abs(a_p) <= epsilon] = 0 # compute circuit - OA = compressed_uniform_rotation(a_m).compose( - compressed_uniform_rotation(a_p, ry=False) + OA = compressed_uniform_rotation_with_lut(a_m).compose( + compressed_uniform_rotation_with_lut(a_p, ry=False) ) circ = QuantumCircuit(2*logn + 1) @@ -112,7 +114,8 @@ def fable(a, epsilon=None): subnormalization factor ''' epsm = np.finfo(a.dtype).eps - alpha = np.linalg.norm(np.ravel(a), np.inf) + # alpha = np.linalg.norm(np.ravel(a), np.inf) + alpha = 1.0 if alpha > 1: alpha = alpha + np.sqrt(epsm) a = a/alpha From dab6af096c99623a01c6ad17c2cb194842d721cd Mon Sep 17 00:00:00 2001 From: Ananthakrishna Gopal <50374726+ananthakrishnagopal@users.noreply.github.com> Date: Sat, 3 May 2025 20:23:10 +0530 Subject: [PATCH 09/11] Update _util.py added lookup table and optimized logic. --- py/fable/_util.py | 80 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/py/fable/_util.py b/py/fable/_util.py index c8fd1ef..f74bacc 100644 --- a/py/fable/_util.py +++ b/py/fable/_util.py @@ -155,3 +155,83 @@ def compressed_uniform_rotation(a, ry=True): circ.cx(j, 0) return circ + + +def count_trailing_ones_optimized(x): + if x == 0: + return 0 + return ((x ^ (x + 1)) >> 1).bit_length() + +def compute_control_optimized(i, n, max_i): + if i == max_i: + return 1 + trailing_ones = count_trailing_ones_optimized(i - 1) + return 2*n - trailing_ones + +def precompute_control_lut(n): + max_trailing_ones = 2*n + return [2*n - t for t in range(max_trailing_ones + 1)] + +from functools import lru_cache + +# @lru_cache(maxsize=None) +def compressed_uniform_rotation_with_lut(a, ry=True): + """ + Optimized compressed uniform rotation using a control LUT. + + Args: + a (np.ndarray): Thresholded vector of dimension 2^(2n) + ry (bool): Apply RY if True, RZ otherwise + + Returns: + QuantumCircuit: Qiskit circuit representing the rotation. + """ + from qiskit import QuantumCircuit + + n = int(np.log2(a.shape[0]) // 2) + max_i = a.shape[0] + circ = QuantumCircuit(2 * n + 1) + + # Precompute control LUT (size O(n)) + max_trailing_ones = 2 * n + control_lut = precompute_control_lut(n) + + i = 0 + while i < max_i: + parity_check = 0 + + # Apply rotation + if a[i] != 0: + if ry: + circ.ry(a[i], 0) + else: + circ.rz(a[i], 0) + + # Skip zero blocks + while True: + # Special case: i+1 == max_i + if i + 1 == max_i: + ctrl = 1 + else: + # Compute trailing_ones(i) and use LUT + trailing_ones = count_trailing_ones_optimized(i) + ctrl = control_lut[trailing_ones] + + # Toggle control bit + parity_check ^= (1 << (ctrl - 1)) + i += 1 + + # Break if non-zero or end of array + if i >= max_i or a[i] != 0: + break + + # Add CNOT gates (only for set bits) + bit = 1 + pos = 0 + while bit <= parity_check: + if parity_check & bit: + circ.cx(pos + 1, 0) + bit <<= 1 + pos += 1 + + return circ From 59c91a35e16ee7b5b71ff9c10cd9e548f474e26c Mon Sep 17 00:00:00 2001 From: Ananthakrishna Gopal <50374726+ananthakrishnagopal@users.noreply.github.com> Date: Sat, 3 May 2025 20:23:41 +0530 Subject: [PATCH 10/11] Update fable.py --- py/fable/fable.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/py/fable/fable.py b/py/fable/fable.py index e3ca837..9f5ba1c 100644 --- a/py/fable/fable.py +++ b/py/fable/fable.py @@ -4,7 +4,7 @@ from qiskit import QuantumCircuit from ._util import compressed_uniform_rotation, sfwht, gray_permutation,sfwht_numba,gray_permutation_vectorized from ._util import compressed_uniform_rotation_with_lut - + def faster_fable(a, epsilon=None): '''FABLE - Fast Approximate BLock Encodings. @@ -21,8 +21,8 @@ def faster_fable(a, epsilon=None): subnormalization factor ''' epsm = np.finfo(a.dtype).eps - # alpha = np.linalg.norm(np.ravel(a), np.inf) - alpha = 1.0 + alpha = np.linalg.norm(np.ravel(a), np.inf) + # alpha = 1.0 if alpha > 1: alpha = alpha + np.sqrt(epsm) a = a/alpha From 62753a9b0c3977f92e178fac68b0a78f91e76836 Mon Sep 17 00:00:00 2001 From: Ananthakrishna Gopal <50374726+ananthakrishnagopal@users.noreply.github.com> Date: Sat, 3 May 2025 20:24:40 +0530 Subject: [PATCH 11/11] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index deac563..4a0065a 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,9 @@ # Performance Upgrades To FABLE - Vectorized/Numba JITified Walsh Hadamard Transforms -> 100 - 400x speedups - Vectorized Gray Code Permutations -> upto 20-30x speedups +- Lookup tables for computing control qubits -> 50x speedups + +- Overall atleast 30% speedup on base version.