Skip to content

Commit f074b98

Browse files
committed
Refactor bloom filter API to use static factory methods instead of standalone function
- Replace create_bloom_filter() function with bloom_filter.create_by_accuracy() and bloom_filter.create_by_size() static methods - Follow repository vernacular pattern similar to other sketches - Update tests to use new API - Fix indentation issues in test file - All tests passing (4/4 bloom filter tests, 37/37 other tests)
1 parent b118463 commit f074b98

2 files changed

Lines changed: 123 additions & 63 deletions

File tree

src/bloom_filter_wrapper.cpp

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -30,15 +30,7 @@ void bind_bloom_filter(nb::module_ &m, const char* name) {
3030
using namespace datasketches;
3131
using bloom_filter_type = bloom_filter_alloc<A>;
3232

33-
// Start with just one simple function
34-
m.def("create_bloom_filter",
35-
[](uint64_t max_distinct_items, double target_false_positive_prob) {
36-
return bloom_filter_type::builder::create_by_accuracy(max_distinct_items, target_false_positive_prob);
37-
},
38-
nb::arg("max_distinct_items"), nb::arg("target_false_positive_prob"),
39-
"Creates a Bloom filter with optimal parameters for the given accuracy requirements");
40-
41-
// Bind the class with minimal methods
33+
// Bind the class with static factory methods only
4234
nb::class_<bloom_filter_type>(m, name)
4335
.def("is_empty", &bloom_filter_type::is_empty,
4436
"Returns True if the filter has seen no items, otherwise False")
@@ -63,7 +55,39 @@ void bind_bloom_filter(nb::module_ &m, const char* name) {
6355
return bloom_filter_type::deserialize(bytes.c_str(), bytes.size());
6456
},
6557
nb::arg("bytes"),
66-
"Reads a bytes object and returns the corresponding bloom_filter");
58+
"Reads a bytes object and returns the corresponding bloom_filter")
59+
.def_static("suggest_num_hashes",
60+
static_cast<uint16_t (*)(uint64_t, uint64_t)>(&bloom_filter_type::builder::suggest_num_hashes),
61+
nb::arg("max_distinct_items"), nb::arg("num_filter_bits"),
62+
"Suggests the optimal number of hash functions for given target numbers of distinct items and filter size")
63+
.def_static("suggest_num_hashes_by_probability",
64+
static_cast<uint16_t (*)(double)>(&bloom_filter_type::builder::suggest_num_hashes),
65+
nb::arg("target_false_positive_prob"),
66+
"Suggests the optimal number of hash functions to achieve a target false positive probability")
67+
.def_static("suggest_num_filter_bits",
68+
&bloom_filter_type::builder::suggest_num_filter_bits,
69+
nb::arg("max_distinct_items"), nb::arg("target_false_positive_prob"),
70+
"Suggests the optimal number of bits for given target numbers of distinct items and false positive probability")
71+
.def_static("create_by_accuracy",
72+
[](uint64_t max_distinct_items, double target_false_positive_prob, uint64_t seed) {
73+
return bloom_filter_type::builder::create_by_accuracy(max_distinct_items, target_false_positive_prob, seed);
74+
},
75+
nb::arg("max_distinct_items"), nb::arg("target_false_positive_prob"), nb::arg("seed")=bloom_filter_type::builder::generate_random_seed(),
76+
"Creates a Bloom filter with optimal parameters for the given accuracy requirements\n\n"
77+
":param max_distinct_items: Maximum expected number of distinct items to add to the filter\n:type max_distinct_items: int\n"
78+
":param target_false_positive_prob: Desired false positive probability per item\n:type target_false_positive_prob: float\n"
79+
":param seed: Hash seed to use (default: random)\n:type seed: int, optional"
80+
)
81+
.def_static("create_by_size",
82+
[](uint64_t num_bits, uint16_t num_hashes, uint64_t seed) {
83+
return bloom_filter_type::builder::create_by_size(num_bits, num_hashes, seed);
84+
},
85+
nb::arg("num_bits"), nb::arg("num_hashes"), nb::arg("seed")=bloom_filter_type::builder::generate_random_seed(),
86+
"Creates a Bloom filter with specified size parameters\n\n"
87+
":param num_bits: Size of the Bloom filter in bits\n:type num_bits: int\n"
88+
":param num_hashes: Number of hash functions to apply to items\n:type num_hashes: int\n"
89+
":param seed: Hash seed to use (default: random)\n:type seed: int, optional"
90+
);
6791
}
6892

6993
void init_bloom_filter(nb::module_ &m) {

tests/bloom_filter_test.py

Lines changed: 89 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -14,65 +14,101 @@
1414
# KIND, either express or implied. See the License for the
1515
# specific language governing permissions and limitations
1616
# under the License.
17-
17+
1818
import unittest
19-
from datasketches import create_bloom_filter
19+
from datasketches import bloom_filter
2020

2121
class BloomFilterTest(unittest.TestCase):
22-
def test_create_bloom_filter(self):
23-
"""Test that we can create a bloom filter with basic parameters"""
24-
bf = create_bloom_filter(1000, 0.01)
25-
self.assertIsNotNone(bf)
26-
self.assertTrue(bf.is_empty())
27-
28-
def test_bloom_filter_empty_state(self):
29-
"""Test that newly created bloom filter is empty"""
30-
bf = create_bloom_filter(100, 0.05)
31-
self.assertTrue(bf.is_empty())
32-
33-
def test_bloom_filter_update_and_query(self):
34-
"""Test basic update and query functionality"""
35-
bf = create_bloom_filter(1000, 0.01)
36-
37-
# Initially empty
38-
self.assertTrue(bf.is_empty())
39-
self.assertFalse(bf.query("test_item"))
40-
41-
# Add an item
42-
bf.update("test_item")
43-
self.assertFalse(bf.is_empty())
44-
self.assertTrue(bf.query("test_item"))
45-
46-
# Query for item not in filter
47-
self.assertFalse(bf.query("other_item"))
48-
49-
def test_bloom_filter_serialize_deserialize(self):
50-
"""Test that we can serialize a bloom filter and restore it afterwards"""
51-
bf = create_bloom_filter(1000, 0.01)
52-
bf.update("test_item")
53-
serialized = bf.serialize()
54-
self.assertIsNotNone(serialized)
55-
self.assertTrue(len(serialized) > 0)
56-
57-
bf = create_bloom_filter(1000, 0.01)
58-
items = ["alpha", "beta", "gamma"]
59-
for it in items:
60-
bf.update(it)
61-
62-
payload = bf.serialize()
63-
self.assertTrue(len(payload) > 0)
64-
self.assertEqual(bf.get_serialized_size_bytes(), len(payload))
65-
66-
restored = bf.deserialize(payload)
67-
self.assertFalse(restored.is_empty())
22+
def test_bloom_filter_accuracy_constructor(self):
23+
# Test the accuracy-based constructor (max_distinct_items, target_false_positive_prob, seed)
24+
max_distinct_items = 1000
25+
target_false_positive_prob = 0.01
26+
27+
# Create bloom filter using accuracy parameters
28+
bf = bloom_filter.create_by_accuracy(max_distinct_items, target_false_positive_prob)
29+
self.assertTrue(bf.is_empty())
30+
31+
# Add some items
32+
test_items = ["item1", "item2", "item3", "item4", "item5"]
33+
for item in test_items:
34+
bf.update(item)
35+
36+
self.assertFalse(bf.is_empty())
37+
38+
# Query items that were added
39+
for item in test_items:
40+
self.assertTrue(bf.query(item))
41+
42+
# Query items that were not added (should mostly return False, but may have false positives)
43+
non_existent_items = ["not_item1", "not_item2", "not_item3"]
44+
for item in non_existent_items:
45+
# We can't assert False here due to false positive possibility
46+
# Just verify the method works
47+
result = bf.query(item)
48+
self.assertIsInstance(result, bool)
6849

69-
# Inserted items should come back as "might be present" (very high probability true)
70-
for it in items:
71-
self.assertTrue(restored.query(it), f"Expected present after round-trip: {it}")
50+
def test_bloom_filter_size_constructor(self):
51+
# Test the size-based constructor (num_bits, num_hashes, seed)
52+
num_bits = 8192 # 8KB in bits
53+
num_hashes = 5
54+
55+
# Create bloom filter using size parameters
56+
bf = bloom_filter.create_by_size(num_bits, num_hashes)
57+
self.assertTrue(bf.is_empty())
58+
59+
# Add some items
60+
test_items = ["item1", "item2", "item3"]
61+
for item in test_items:
62+
bf.update(item)
63+
64+
self.assertFalse(bf.is_empty())
65+
66+
# Query items that were added
67+
for item in test_items:
68+
self.assertTrue(bf.query(item))
7269

73-
# A not-inserted key should usually be absent (Bloom could FP, but unlikely here)
74-
self.assertFalse(restored.query("not_inserted"))
70+
def test_bloom_filter_static_methods(self):
71+
# Test the static helper methods
72+
max_distinct_items = 1000
73+
target_false_positive_prob = 0.01
74+
75+
# Test suggest_num_hashes_by_probability
76+
num_hashes = bloom_filter.suggest_num_hashes_by_probability(target_false_positive_prob)
77+
self.assertIsInstance(num_hashes, int)
78+
self.assertGreater(num_hashes, 0)
79+
80+
# Test suggest_num_filter_bits
81+
num_bits = bloom_filter.suggest_num_filter_bits(max_distinct_items, target_false_positive_prob)
82+
self.assertIsInstance(num_bits, int)
83+
self.assertGreater(num_bits, 0)
84+
85+
# Test suggest_num_hashes with both parameters
86+
num_hashes_alt = bloom_filter.suggest_num_hashes(max_distinct_items, num_bits)
87+
self.assertIsInstance(num_hashes_alt, int)
88+
self.assertGreater(num_hashes_alt, 0)
7589

90+
def test_bloom_filter_serialization(self):
91+
# Test serialization and deserialization
92+
bf = bloom_filter.create_by_accuracy(1000, 0.01)
93+
94+
# Add some items
95+
test_items = ["item1", "item2", "item3"]
96+
for item in test_items:
97+
bf.update(item)
98+
99+
# Serialize
100+
bf_bytes = bf.serialize()
101+
self.assertEqual(bf.get_serialized_size_bytes(), len(bf_bytes))
102+
103+
# Deserialize
104+
new_bf = bloom_filter.deserialize(bf_bytes)
105+
106+
# Verify the deserialized filter has the same behavior
107+
for item in test_items:
108+
self.assertTrue(new_bf.query(item))
109+
110+
# Verify it's not empty
111+
self.assertFalse(new_bf.is_empty())
76112

77113
if __name__ == '__main__':
78114
unittest.main()

0 commit comments

Comments
 (0)