-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBloomFilter.h
144 lines (120 loc) · 3.44 KB
/
BloomFilter.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#include <cstddef>
#include <string>
#include <cstdint>
#include <bitset>
#include <stdexcept>
#include "city.h"
#include "MurmurHash3.h"
#include <iostream>
#include <nthash/nthash.hpp>
template<typename T>
class HashFunction {
public:
virtual size_t operator()(const T& data, size_t length) const = 0;
};
template<typename T>
class MurmurHash3 : public HashFunction<T> {
public:
size_t operator()(const T& data, size_t length) const override{
uint32_t hashValue;
const void* keyPtr = static_cast<const void*>(data.c_str());
MurmurHash3_x86_32(keyPtr, length, 0, &hashValue);
return hashValue;
}
};
template<typename T>
class CityHash : public HashFunction<T> {
public:
size_t operator()(const T& data, size_t length) const override{
return CityHash64(data.c_str(), length);
}
};
template<typename T>
class NtHashFunction : public HashFunction<T> {
public:
NtHashFunction(size_t hashes, size_t k) :
num_hashes(hashes),
k(k)
{}
size_t operator()(const T& data, size_t length) const override {
nthash::NtHash nth(data, num_hashes, k);
while(nth.roll()){}
return nth.get_forward_hash();
}
private:
size_t num_hashes;
size_t k;
};
template <typename T>
struct HashFunctionTraits {
using ConstructorType = void;
};
template <>
struct HashFunctionTraits<NtHashFunction<std::string>> {
using ConstructorType = NtHashFunction<std::string>;
};
template <>
struct HashFunctionTraits<MurmurHash3<std::string>> {
using ConstructorType = MurmurHash3<std::string>;
};
template <>
struct HashFunctionTraits<CityHash<std::string>> {
using ConstructorType = CityHash<std::string>;
};
// Defining a generic HashFunction type and a generic Size value for bitset instantiation.
// You can do it differently if you prefer, for example by using a boolean vector and resizing with a constructor argument.
// A bitset should have less memory overhead (in theory)
template <typename HashFunc>
class Bloomfilter
{
public:
Bloomfilter(size_t filter_size = default_size) :
bloomfilter_store(filter_size),
object_count_(0),
hashFunc()
{
using ConstructorType = typename HashFunctionTraits<HashFunc>::ConstructorType;
hashFunc = ConstructorType();
}
Bloomfilter(size_t filter_size, size_t hashes, size_t k) :
bloomfilter_store(filter_size),
object_count_(0),
hashFunc(hashes, k)
{
using ConstructorType = typename HashFunctionTraits<HashFunc>::ConstructorType;
hashFunc = ConstructorType(hashes, k);
}
void insert(const std::string& object)
{
size_t hash = hashFunc(object, object.size()) % bloomfilter_store.size();
bloomfilter_store[hash] = true;
++object_count_;
}
void clear()
{
bloomfilter_store.clear();
object_count_ = 0; // Reset the object count
}
bool contains(const std::string& object) const
{
size_t hash = hashFunc(object, object.size()) % bloomfilter_store.size();
return bloomfilter_store[hash];
}
size_t object_count() const
{
return object_count_;
}
bool empty() const
{
return 0 == object_count();
}
size_t size() const
{
return bloomfilter_store.size();
}
private:
static constexpr size_t default_size = 33554432;
std::vector<bool> bloomfilter_store;
size_t object_count_;
HashFunc hashFunc;
};