Skip to content

Commit ffe1039

Browse files
committed
Add reset method to bloom filter bindings
- Add reset() method to Python bloom filter wrapper
- Method clears all bits and resets filter to empty state
- Preserves filter configuration (capacity, num_hashes, seed)
- Add comprehensive unit test for reset functionality
- Test verifies empty state, bit usage reset, and post-reset operations
1 parent f074b98 commit ffe1039

2 files changed

Lines changed: 271 additions & 28 deletions

File tree

src/bloom_filter_wrapper.cpp

Lines changed: 23 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,8 @@ void bind_bloom_filter(nb::module_ &m, const char* name) {
4040
.def("query", static_cast<bool (bloom_filter_type::*)(const std::string&) const>(&bloom_filter_type::query),
4141
nb::arg("item"),
4242
"Queries the filter for the given string")
43+
.def("reset", &bloom_filter_type::reset,
44+
"Resets the Bloom filter to its original empty state")
4345
.def("get_serialized_size_bytes",
4446
[](const bloom_filter_type& sk) { return sk.get_serialized_size_bytes(); },
4547
"Returns the size in bytes of the serialized image of the filter")
@@ -87,7 +89,27 @@ void bind_bloom_filter(nb::module_ &m, const char* name) {
8789
":param num_bits: Size of the Bloom filter in bits\n:type num_bits: int\n"
8890
":param num_hashes: Number of hash functions to apply to items\n:type num_hashes: int\n"
8991
":param seed: Hash seed to use (default: random)\n:type seed: int, optional"
90-
);
92+
)
93+
.def_prop_ro(
94+
"num_bits_used",
95+
&bloom_filter_type::get_bits_used,
96+
"Number of bits set to 1 in the Bloom filter"
97+
)
98+
.def_prop_ro(
99+
"capacity",
100+
&bloom_filter_type::get_capacity,
101+
"Number of bits in the Bloom filter's bit array"
102+
)
103+
.def_prop_ro(
104+
"num_hashes",
105+
&bloom_filter_type::get_num_hashes,
106+
"Number of hash functions used by this Bloom filter"
107+
)
108+
.def_prop_ro(
109+
"seed",
110+
&bloom_filter_type::get_seed,
111+
"Hash seed used by this Bloom filter"
112+
);
91113
}
92114

93115
void init_bloom_filter(nb::module_ &m) {

tests/bloom_filter_test.py

Lines changed: 248 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -19,96 +19,317 @@
1919
from datasketches import bloom_filter
2020

2121
class BloomFilterTest(unittest.TestCase):
22+
def test_standard_constructors(self):
23+
"""Test standard constructors with exact validation."""
24+
num_items = 4000
25+
fpp = 0.01
26+
27+
num_bits = bloom_filter.suggest_num_filter_bits(num_items, fpp)
28+
num_hashes = bloom_filter.suggest_num_hashes(num_items, num_bits)
29+
seed = 89023
30+
31+
bf = bloom_filter.create_by_size(num_bits, num_hashes, seed)
32+
# C++ rounds up to nearest multiple of 64
33+
adjusted_num_bits = (num_bits + 63) & ~0x3F
34+
self.assertEqual(bf.capacity, adjusted_num_bits)
35+
self.assertEqual(bf.num_hashes, num_hashes)
36+
self.assertEqual(bf.seed, seed)
37+
self.assertTrue(bf.is_empty())
38+
39+
# Should match above
40+
bf2 = bloom_filter.create_by_accuracy(num_items, fpp, seed)
41+
self.assertEqual(bf2.capacity, adjusted_num_bits)
42+
self.assertEqual(bf2.num_hashes, num_hashes)
43+
self.assertEqual(bf2.seed, seed)
44+
self.assertTrue(bf2.is_empty())
45+
46+
def test_basic_operations(self):
47+
"""Test basic operations with validation."""
48+
num_items = 5000
49+
fpp = 0.01
50+
seed = 4897301548054
51+
52+
bf = bloom_filter.create_by_accuracy(num_items, fpp, seed)
53+
self.assertTrue(bf.is_empty())
54+
self.assertEqual(bf.num_bits_used, 0)
55+
56+
# Add items
57+
for i in range(num_items):
58+
bf.update(str(i))
59+
60+
self.assertFalse(bf.is_empty())
61+
self.assertGreater(bf.num_bits_used, 0)
62+
self.assertLessEqual(bf.num_bits_used, bf.capacity)
63+
64+
# Count false positives
65+
false_positives = 0
66+
for i in range(num_items, min(num_items + 1000, bf.capacity)):
67+
if bf.query(str(i)):
68+
false_positives += 1
69+
70+
self.assertGreater(false_positives, 0)
71+
self.assertLess(false_positives, 100)
72+
73+
# Test serialization
74+
bf_bytes = bf.serialize()
75+
self.assertEqual(bf.get_serialized_size_bytes(), len(bf_bytes))
76+
77+
new_bf = bloom_filter.deserialize(bf_bytes)
78+
self.assertEqual(bf.capacity, new_bf.capacity)
79+
self.assertEqual(bf.num_hashes, new_bf.num_hashes)
80+
self.assertEqual(bf.seed, new_bf.seed)
81+
self.assertEqual(bf.num_bits_used, new_bf.num_bits_used)
82+
83+
# Verify all original items are still found
84+
for i in range(num_items):
85+
self.assertTrue(new_bf.query(str(i)))
86+
87+
def test_reset_method(self):
88+
"""Test the reset method functionality."""
89+
bf = bloom_filter.create_by_accuracy(1000, 0.01)
90+
91+
# Initially empty
92+
self.assertTrue(bf.is_empty())
93+
self.assertEqual(bf.num_bits_used, 0)
94+
95+
# Add some items
96+
test_items = ["item1", "item2", "item3", "item4", "item5"]
97+
for item in test_items:
98+
bf.update(item)
99+
100+
# Verify items were added
101+
self.assertFalse(bf.is_empty())
102+
self.assertGreater(bf.num_bits_used, 0)
103+
104+
for item in test_items:
105+
self.assertTrue(bf.query(item))
106+
107+
# Reset the filter
108+
bf.reset()
109+
110+
# Verify filter is back to empty state
111+
self.assertTrue(bf.is_empty())
112+
self.assertEqual(bf.num_bits_used, 0)
113+
114+
# Verify none of the original items are found
115+
for item in test_items:
116+
self.assertFalse(bf.query(item))
117+
118+
# Verify properties are preserved
119+
self.assertGreater(bf.capacity, 0)
120+
self.assertGreater(bf.num_hashes, 0)
121+
self.assertIsInstance(bf.seed, int)
122+
123+
# Can add new items after reset
124+
bf.update("new_item")
125+
self.assertFalse(bf.is_empty())
126+
self.assertTrue(bf.query("new_item"))
127+
22128
def test_bloom_filter_accuracy_constructor(self):
23-
# Test the accuracy-based constructor (max_distinct_items, target_false_positive_prob, seed)
129+
"""Test the accuracy-based constructor."""
24130
max_distinct_items = 1000
25131
target_false_positive_prob = 0.01
26132

27-
# Create bloom filter using accuracy parameters
28133
bf = bloom_filter.create_by_accuracy(max_distinct_items, target_false_positive_prob)
29134
self.assertTrue(bf.is_empty())
30135

31-
# Add some items
32136
test_items = ["item1", "item2", "item3", "item4", "item5"]
33137
for item in test_items:
34138
bf.update(item)
35139

36140
self.assertFalse(bf.is_empty())
37141

38-
# Query items that were added
39142
for item in test_items:
40143
self.assertTrue(bf.query(item))
41-
42-
# Query items that were not added (should mostly return False, but may have false positives)
43-
non_existent_items = ["not_item1", "not_item2", "not_item3"]
44-
for item in non_existent_items:
45-
# We can't assert False here due to false positive possibility
46-
# Just verify the method works
47-
result = bf.query(item)
48-
self.assertIsInstance(result, bool)
49144

50145
def test_bloom_filter_size_constructor(self):
51-
# Test the size-based constructor (num_bits, num_hashes, seed)
52-
num_bits = 8192 # 8KB in bits
146+
"""Test the size-based constructor."""
147+
num_bits = 8192
53148
num_hashes = 5
54149

55-
# Create bloom filter using size parameters
56150
bf = bloom_filter.create_by_size(num_bits, num_hashes)
57151
self.assertTrue(bf.is_empty())
152+
self.assertEqual(bf.capacity, num_bits)
153+
self.assertEqual(bf.num_hashes, num_hashes)
58154

59-
# Add some items
60155
test_items = ["item1", "item2", "item3"]
61156
for item in test_items:
62157
bf.update(item)
63158

64159
self.assertFalse(bf.is_empty())
65160

66-
# Query items that were added
67161
for item in test_items:
68162
self.assertTrue(bf.query(item))
69163

70164
def test_bloom_filter_static_methods(self):
71-
# Test the static helper methods
165+
"""Test the static helper methods."""
72166
max_distinct_items = 1000
73167
target_false_positive_prob = 0.01
74168

75-
# Test suggest_num_hashes_by_probability
76169
num_hashes = bloom_filter.suggest_num_hashes_by_probability(target_false_positive_prob)
77170
self.assertIsInstance(num_hashes, int)
78171
self.assertGreater(num_hashes, 0)
79172

80-
# Test suggest_num_filter_bits
81173
num_bits = bloom_filter.suggest_num_filter_bits(max_distinct_items, target_false_positive_prob)
82174
self.assertIsInstance(num_bits, int)
83175
self.assertGreater(num_bits, 0)
84176

85-
# Test suggest_num_hashes with both parameters
86177
num_hashes_alt = bloom_filter.suggest_num_hashes(max_distinct_items, num_bits)
87178
self.assertIsInstance(num_hashes_alt, int)
88179
self.assertGreater(num_hashes_alt, 0)
89180

90181
def test_bloom_filter_serialization(self):
91-
# Test serialization and deserialization
182+
"""Test serialization and deserialization."""
92183
bf = bloom_filter.create_by_accuracy(1000, 0.01)
93184

94-
# Add some items
95185
test_items = ["item1", "item2", "item3"]
96186
for item in test_items:
97187
bf.update(item)
98188

99-
# Serialize
100189
bf_bytes = bf.serialize()
101190
self.assertEqual(bf.get_serialized_size_bytes(), len(bf_bytes))
102191

103-
# Deserialize
104192
new_bf = bloom_filter.deserialize(bf_bytes)
105193

106-
# Verify the deserialized filter has the same behavior
107194
for item in test_items:
108195
self.assertTrue(new_bf.query(item))
109196

110-
# Verify it's not empty
111197
self.assertFalse(new_bf.is_empty())
198+
self.assertEqual(bf.capacity, new_bf.capacity)
199+
self.assertEqual(bf.num_hashes, new_bf.num_hashes)
200+
self.assertEqual(bf.seed, new_bf.seed)
201+
self.assertEqual(bf.num_bits_used, new_bf.num_bits_used)
202+
203+
def test_empty_serialization(self):
204+
"""Test serialization of empty bloom filter."""
205+
bf = bloom_filter.create_by_accuracy(1000, 0.01)
206+
self.assertTrue(bf.is_empty())
207+
208+
bf_bytes = bf.serialize()
209+
self.assertEqual(bf.get_serialized_size_bytes(), len(bf_bytes))
210+
211+
new_bf = bloom_filter.deserialize(bf_bytes)
212+
self.assertTrue(new_bf.is_empty())
213+
self.assertEqual(bf.capacity, new_bf.capacity)
214+
self.assertEqual(bf.num_hashes, new_bf.num_hashes)
215+
self.assertEqual(bf.seed, new_bf.seed)
216+
217+
def test_bits_used_properties(self):
218+
"""Test that bits_used behaves correctly."""
219+
bf = bloom_filter.create_by_accuracy(1000, 0.01)
220+
221+
self.assertEqual(bf.num_bits_used, 0)
222+
223+
bf.update("alpha")
224+
bits1 = bf.num_bits_used
225+
self.assertIsInstance(bits1, int)
226+
self.assertGreater(bits1, 0)
227+
228+
# Idempotent
229+
bits1_again = bf.num_bits_used
230+
self.assertEqual(bits1_again, bits1)
231+
232+
# Same item shouldn't change bits_used
233+
bf.update("alpha")
234+
bits_dup = bf.num_bits_used
235+
self.assertEqual(bits_dup, bits1)
236+
237+
# Additional items should be non-decreasing
238+
for s in ["beta", "gamma", "delta"]:
239+
bf.update(s)
240+
new_bits = bf.num_bits_used
241+
self.assertGreaterEqual(new_bits, bits1)
242+
bits1 = new_bits
243+
244+
def test_capacity_properties(self):
245+
"""Test that capacity is positive and constant."""
246+
bf = bloom_filter.create_by_accuracy(1000, 0.01)
247+
cap1 = bf.capacity
248+
self.assertIsInstance(cap1, int)
249+
self.assertGreater(cap1, 0)
250+
251+
cap2 = bf.capacity
252+
self.assertEqual(cap1, cap2)
253+
254+
bf.update("alpha")
255+
bf.update("beta")
256+
self.assertEqual(cap1, bf.capacity)
257+
258+
def test_num_hashes_properties(self):
259+
"""Test that num_hashes is consistent."""
260+
bf = bloom_filter.create_by_accuracy(1000, 0.01)
261+
k1 = bf.num_hashes
262+
self.assertIsInstance(k1, int)
263+
self.assertGreaterEqual(k1, 1)
264+
bf.update("alpha")
265+
self.assertEqual(k1, bf.num_hashes)
266+
267+
bf2 = bloom_filter.create_by_size(10000, 3)
268+
k2 = bf2.num_hashes
269+
self.assertIsInstance(k2, int)
270+
self.assertEqual(k2, 3)
271+
bf2.update("beta")
272+
self.assertEqual(k2, bf2.num_hashes)
273+
274+
def test_seed_properties(self):
275+
"""Test that seed is consistent."""
276+
bf = bloom_filter.create_by_accuracy(1000, 0.01)
277+
s1 = bf.seed
278+
self.assertIsInstance(s1, int)
279+
280+
bf.update("alpha")
281+
self.assertEqual(s1, bf.seed)
282+
283+
explicit_seed = 12345
284+
bf2 = bloom_filter.create_by_accuracy(1000, 0.01, explicit_seed)
285+
s2 = bf2.seed
286+
self.assertIsInstance(s2, int)
287+
self.assertEqual(s2, explicit_seed)
288+
289+
def test_deterministic_behavior(self):
290+
"""Test that bloom filters with the same seed behave deterministically."""
291+
seed = 12345
292+
max_distinct_items = 1000
293+
target_fpp = 0.01
294+
295+
bf1 = bloom_filter.create_by_accuracy(max_distinct_items, target_fpp, seed)
296+
bf2 = bloom_filter.create_by_accuracy(max_distinct_items, target_fpp, seed)
297+
298+
test_items = [f"item_{i}" for i in range(100)]
299+
for item in test_items:
300+
bf1.update(item)
301+
bf2.update(item)
302+
303+
self.assertEqual(bf1.capacity, bf2.capacity)
304+
self.assertEqual(bf1.num_hashes, bf2.num_hashes)
305+
self.assertEqual(bf1.seed, bf2.seed)
306+
self.assertEqual(bf1.num_bits_used, bf2.num_bits_used)
307+
308+
for item in test_items:
309+
self.assertEqual(bf1.query(item), bf2.query(item))
310+
311+
def test_edge_cases(self):
312+
"""Test edge cases and boundary conditions."""
313+
bf_small = bloom_filter.create_by_size(64, 1)
314+
self.assertEqual(bf_small.capacity, 64)
315+
self.assertEqual(bf_small.num_hashes, 1)
316+
317+
bf_many_hashes = bloom_filter.create_by_size(1000, 20)
318+
self.assertEqual(bf_many_hashes.num_hashes, 20)
319+
320+
bf_low_fpp = bloom_filter.create_by_accuracy(100, 1e-6)
321+
self.assertGreater(bf_low_fpp.num_hashes, 1)
322+
323+
def test_mathematical_properties(self):
324+
"""Test mathematical properties of bloom filters."""
325+
bf = bloom_filter.create_by_size(100, 5)
326+
self.assertEqual(bf.capacity % 64, 0)
327+
328+
bf.update("test_item")
329+
self.assertLessEqual(bf.num_bits_used, bf.capacity)
330+
331+
self.assertGreater(bf.num_hashes, 0)
332+
self.assertIsInstance(bf.seed, int)
112333

113334
if __name__ == '__main__':
114335
unittest.main()

0 commit comments

Comments
 (0)