Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Quotient Filter: Get Hashes and Resize / Auto Expand #115

Merged
merged 7 commits into from
Jan 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 2 additions & 14 deletions probables/blooms/bloom.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,21 +315,9 @@ def export_c_header(self, filename: Union[str, Path]) -> None:
with open(filename, "w", encoding="utf-8") as file:
print(f"/* BloomFilter Export of a {bloom_type} */", file=file)
print("#include <inttypes.h>", file=file)
print(
"const uint64_t estimated_elements = ",
self.estimated_elements,
";",
sep="",
file=file,
)
print("const uint64_t estimated_elements = ", self.estimated_elements, ";", sep="", file=file)
print("const uint64_t elements_added = ", self.elements_added, ";", sep="", file=file)
print(
"const float false_positive_rate = ",
self.false_positive_rate,
";",
sep="",
file=file,
)
print("const float false_positive_rate = ", self.false_positive_rate, ";", sep="", file=file)
print("const uint64_t number_bits = ", self.number_bits, ";", sep="", file=file)
print("const unsigned int number_hashes = ", self.number_hashes, ";", sep="", file=file)
print("const unsigned char bloom[] = {", *data, "};", sep="\n", file=file)
Expand Down
164 changes: 149 additions & 15 deletions probables/quotientfilter/quotientfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"""

from array import array
from typing import Optional
from typing import Iterator, List, Optional

from probables.hashes import KeyT, SimpleHashT, fnv_1a_32
from probables.utilities import Bitarray
Expand All @@ -15,6 +15,7 @@ class QuotientFilter:

Args:
quotient (int): The size of the quotient to use
auto_expand (bool): Automatically expand or not
hash_function (function): Hashing strategy function to use `hf(key, number)`
Returns:
QuotientFilter: The initialized filter
Expand All @@ -35,18 +36,27 @@ class QuotientFilter:
"_is_continuation",
"_is_shifted",
"_filter",
"_max_load_factor",
"_auto_resize",
)

def __init__(self, quotient: int = 20, hash_function: Optional[SimpleHashT] = None): # needs to be parameterized
def __init__(
self, quotient: int = 20, auto_expand: bool = True, hash_function: Optional[SimpleHashT] = None
): # needs to be parameterized
if quotient < 3 or quotient > 31:
raise ValueError(
f"Quotient filter: Invalid quotient setting; quotient must be between 3 and 31; {quotient} was provided"
)
self._q = quotient
self._r = 32 - quotient
self._size = 1 << self._q # same as 2**q
self._elements_added = 0
self.__set_params(quotient, auto_expand, hash_function)

def __set_params(self, quotient: int, auto_expand: bool, hash_function: Optional[SimpleHashT]):
self._q: int = quotient
self._r: int = 32 - quotient
self._size: int = 1 << self._q # same as 2**q
self._elements_added: int = 0
self._auto_resize: bool = auto_expand
self._hash_func: SimpleHashT = fnv_1a_32 if hash_function is None else hash_function # type: ignore
self._max_load_factor: float = 0.85

# ensure we use the smallest type possible to reduce memory wastage
if self._r <= 8:
Expand Down Expand Up @@ -89,21 +99,61 @@ def elements_added(self) -> int:
return self._elements_added

@property
def bits_per_elm(self):
def bits_per_elm(self) -> int:
"""int: The number of bits used per element"""
return self._bits_per_elm

@property
def size(self) -> int:
"""int: The number of bins available in the filter

Note:
same as `num_elements`"""
return self._size

@property
def load_factor(self) -> float:
"""float: The load factor of the filter"""
return self._elements_added / self._size

@property
def auto_expand(self) -> bool:
"""bool: Will the quotient filter automatically expand"""
return self._auto_resize

@auto_expand.setter
def auto_expand(self, val: bool):
"""change the auto expand property"""
self._auto_resize = bool(val)

@property
def max_load_factor(self) -> float:
"""float: The maximum allowed load factor after which auto expanding should occur"""
return self._max_load_factor

@max_load_factor.setter
def max_load_factor(self, val: float):
"""set the maximum load factor"""
self._max_load_factor = float(val)

def add(self, key: KeyT) -> None:
"""Add key to the quotient filter

Args:
key (str|bytes): The element to add"""
_hash = self._hash_func(key, 0)
self.add_alt(_hash)

def add_alt(self, _hash: int) -> None:
"""Add the pre-hashed value to the quotient filter

Args:
_hash (int): The element to add"""
key_quotient = _hash >> self._r
key_remainder = _hash & ((1 << self._r) - 1)

if not self._contains(key_quotient, key_remainder):
# TODO, add it here
if self._contained_at_loc(key_quotient, key_remainder) == -1:
if self._auto_resize and self.load_factor >= self._max_load_factor:
self.resize()
self._add(key_quotient, key_remainder)

def check(self, key: KeyT) -> bool:
Expand All @@ -114,9 +164,92 @@ def check(self, key: KeyT) -> bool:
Return:
bool: True if likely encountered, False if definately not"""
_hash = self._hash_func(key, 0)
return self.check_alt(_hash)

def check_alt(self, _hash: int) -> bool:
"""Check to see if the pre-calculated hash is likely in the quotient filter

Args:
_hash (int): The element to add
Return:
bool: True if likely encountered, False if definately not"""
key_quotient = _hash >> self._r
key_remainder = _hash & ((1 << self._r) - 1)
return self._contains(key_quotient, key_remainder)
return not self._contained_at_loc(key_quotient, key_remainder) == -1

def iter_hashes(self) -> Iterator[int]:
"""A generator over the hashes in the quotient filter

Yields:
int: The next hash stored in the quotient filter"""
queue: List[int] = []

# find first empty location
start = 0
while True:
is_occupied = self._is_occupied.check_bit(start)
is_continuation = self._is_continuation.check_bit(start)
is_shifted = self._is_shifted.check_bit(start)
if is_occupied + is_continuation + is_shifted == 0:
break
start += 1

cur_quot = 0
for i in range(start, self._size + start): # this will allow for wrap-arounds
idx = i % self._size
is_occupied = self._is_occupied.check_bit(idx)
is_continuation = self._is_continuation.check_bit(idx)
is_shifted = self._is_shifted.check_bit(idx)
# Nothing here, keep going
if is_occupied + is_continuation + is_shifted == 0:
assert len(queue) == 0
continue

if is_occupied == 1: # keep track of the indicies that match a hashed quotient
queue.append(idx)

# run start
if not is_continuation and (is_occupied or is_shifted):
cur_quot = queue.pop(0)

if self._filter[idx] != 0:
yield (cur_quot << self._r) + self._filter[idx]

def get_hashes(self) -> List[int]:
"""Get the hashes from the quotient filter as a list

Returns:
list(int): The hash values stored in the quotient filter"""
return list(self.iter_hashes())

def resize(self, quotient: Optional[int] = None) -> None:
"""Resize the quotient filter to use the new quotient size

Args:
int: The new quotient to use
Note:
If `None` is provided, the quotient filter will double in size (quotient + 1)
Raises:
ValueError: When the new quotient will not accommodate the elements already added"""
if quotient is None:
quotient = self._q + 1

if self.elements_added >= (1 << quotient):
raise ValueError("Unable to shrink since there will be too many elements in the quotient filter")
if quotient < 3 or quotient > 31:
raise ValueError(
f"Quotient filter: Invalid quotient setting; quotient must be between 3 and 31; {quotient} was provided"
)

hashes = self.get_hashes()

for i in range(self._size):
self._filter[i] = 0

self.__set_params(quotient, self._auto_resize, self._hash_func)

for _h in hashes:
self.add_alt(_h)

def _shift_insert(self, k, v, start, j, flag):
if self._is_occupied[j] == 0 and self._is_continuation[j] == 0 and self._is_shifted[j] == 0:
Expand Down Expand Up @@ -215,9 +348,10 @@ def _add(self, q: int, r: int):
self._shift_insert(q, r, orig_start_idx, start_idx, 1)
self._elements_added += 1

def _contains(self, q: int, r: int) -> bool:
def _contained_at_loc(self, q: int, r: int) -> int:
"""returns the index location of the element, or -1 if not present"""
if self._is_occupied[q] == 0:
return False
return -1

start_idx = self._get_start_index(q)

Expand All @@ -236,7 +370,7 @@ def _contains(self, q: int, r: int) -> bool:
break

if self._filter[start_idx] == r:
return True
return start_idx

start_idx = (start_idx + 1) & (self._size - 1)
meta_bits = (
Expand All @@ -245,4 +379,4 @@ def _contains(self, q: int, r: int) -> bool:
+ self._is_shifted.check_bit(start_idx)
)

return False
return -1
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ keywords = [
"bloom-filter",
"count-min-sketch",
"cuckoo-filter",
"quotient-filter",
]
readme = "README.rst"
classifiers = [
Expand Down
Loading