1919from datasketches import bloom_filter
2020
class BloomFilterTest(unittest.TestCase):
    """Unit tests for the ``bloom_filter`` class imported from ``datasketches``.

    The tests cover the two factory functions (``create_by_size`` and
    ``create_by_accuracy``), the static sizing helpers, update/query,
    ``reset``, serialization round-trips and the read-only properties
    ``capacity``, ``num_hashes``, ``seed`` and ``num_bits_used``.
    """

    # ------------------------------------------------------------------
    # Shared assertion helpers (names do not start with "test", so the
    # unittest loader does not collect them as test cases).
    # ------------------------------------------------------------------

    def _assert_same_configuration(self, expected, actual, *, compare_bits_used=True):
        """Assert that two filters agree on their reported properties.

        Args:
            expected: Reference filter.
            actual: Filter that must match ``expected``.
            compare_bits_used: When true, ``num_bits_used`` is compared in
                addition to ``capacity``, ``num_hashes`` and ``seed``.
        """
        self.assertEqual(expected.capacity, actual.capacity)
        self.assertEqual(expected.num_hashes, actual.num_hashes)
        self.assertEqual(expected.seed, actual.seed)
        if compare_bits_used:
            self.assertEqual(expected.num_bits_used, actual.num_bits_used)

    def _assert_contains_all(self, bf, items):
        """Assert that ``bf.query`` is true for every element of ``items``.

        The failure message names the first item that was not found, which a
        bare ``assertTrue`` inside a loop would not reveal.
        """
        for item in items:
            self.assertTrue(
                bf.query(item),
                f"filter does not report {item!r} as present",
            )

    def _assert_fresh_filter(self, bf, capacity, num_hashes, seed):
        """Assert that ``bf`` is empty and reports the given properties."""
        self.assertEqual(bf.capacity, capacity)
        self.assertEqual(bf.num_hashes, num_hashes)
        self.assertEqual(bf.seed, seed)
        self.assertTrue(bf.is_empty())

    # ------------------------------------------------------------------
    # Construction
    # ------------------------------------------------------------------

    def test_standard_constructors(self):
        """Test standard constructors with exact validation."""
        num_items = 4000
        fpp = 0.01

        num_bits = bloom_filter.suggest_num_filter_bits(num_items, fpp)
        num_hashes = bloom_filter.suggest_num_hashes(num_items, num_bits)
        seed = 89023

        bf = bloom_filter.create_by_size(num_bits, num_hashes, seed)
        # C++ rounds up to nearest multiple of 64
        adjusted_num_bits = (num_bits + 63) & ~0x3F
        self._assert_fresh_filter(bf, adjusted_num_bits, num_hashes, seed)

        # Building from (num_items, fpp) should yield the same configuration
        # as building from the suggested size and hash count.
        bf2 = bloom_filter.create_by_accuracy(num_items, fpp, seed)
        self._assert_fresh_filter(bf2, adjusted_num_bits, num_hashes, seed)

    def test_basic_operations(self):
        """Test update, query, false-positive rate and a serialization round-trip."""
        num_items = 5000
        fpp = 0.01
        seed = 4897301548054
        num_probes = 1000  # number of never-inserted keys to query

        bf = bloom_filter.create_by_accuracy(num_items, fpp, seed)
        self.assertTrue(bf.is_empty())
        self.assertEqual(bf.num_bits_used, 0)

        # Add items
        for i in range(num_items):
            bf.update(str(i))

        self.assertFalse(bf.is_empty())
        self.assertGreater(bf.num_bits_used, 0)
        self.assertLessEqual(bf.num_bits_used, bf.capacity)

        # Count false positives among keys that were never inserted. The probe
        # range is defined purely in terms of item ids; the filter's bit
        # capacity has no bearing on which keys are queried.
        false_positives = sum(
            1
            for i in range(num_items, num_items + num_probes)
            if bf.query(str(i))
        )

        # With fpp = 0.01 about 10 of the 1000 probes are expected to hit;
        # the upper bound of 100 leaves a tenfold margin.
        self.assertGreater(false_positives, 0)
        self.assertLess(false_positives, 100)

        # Test serialization
        bf_bytes = bf.serialize()
        self.assertEqual(bf.get_serialized_size_bytes(), len(bf_bytes))

        new_bf = bloom_filter.deserialize(bf_bytes)
        self._assert_same_configuration(bf, new_bf)

        # Verify all original items are still found
        self._assert_contains_all(new_bf, (str(i) for i in range(num_items)))

    def test_reset_method(self):
        """Test the reset method functionality."""
        bf = bloom_filter.create_by_accuracy(1000, 0.01)

        # Initially empty
        self.assertTrue(bf.is_empty())
        self.assertEqual(bf.num_bits_used, 0)

        # Add some items
        test_items = ["item1", "item2", "item3", "item4", "item5"]
        for item in test_items:
            bf.update(item)

        # Verify items were added
        self.assertFalse(bf.is_empty())
        self.assertGreater(bf.num_bits_used, 0)
        self._assert_contains_all(bf, test_items)

        # Reset the filter
        bf.reset()

        # Verify filter is back to empty state
        self.assertTrue(bf.is_empty())
        self.assertEqual(bf.num_bits_used, 0)

        # Verify none of the original items are found
        for item in test_items:
            self.assertFalse(
                bf.query(item),
                f"filter still reports {item!r} after reset",
            )

        # Verify properties are preserved
        self.assertGreater(bf.capacity, 0)
        self.assertGreater(bf.num_hashes, 0)
        self.assertIsInstance(bf.seed, int)

        # Can add new items after reset
        bf.update("new_item")
        self.assertFalse(bf.is_empty())
        self.assertTrue(bf.query("new_item"))

    def test_bloom_filter_accuracy_constructor(self):
        """Test the accuracy-based constructor."""
        max_distinct_items = 1000
        target_false_positive_prob = 0.01

        bf = bloom_filter.create_by_accuracy(max_distinct_items, target_false_positive_prob)
        self.assertTrue(bf.is_empty())

        test_items = ["item1", "item2", "item3", "item4", "item5"]
        for item in test_items:
            bf.update(item)

        self.assertFalse(bf.is_empty())
        self._assert_contains_all(bf, test_items)

    def test_bloom_filter_size_constructor(self):
        """Test the size-based constructor."""
        num_bits = 8192  # already a multiple of 64, so capacity is expected to match exactly
        num_hashes = 5

        bf = bloom_filter.create_by_size(num_bits, num_hashes)
        self.assertTrue(bf.is_empty())
        self.assertEqual(bf.capacity, num_bits)
        self.assertEqual(bf.num_hashes, num_hashes)

        test_items = ["item1", "item2", "item3"]
        for item in test_items:
            bf.update(item)

        self.assertFalse(bf.is_empty())
        self._assert_contains_all(bf, test_items)

    def test_bloom_filter_static_methods(self):
        """Test the static helper methods."""
        max_distinct_items = 1000
        target_false_positive_prob = 0.01

        num_hashes = bloom_filter.suggest_num_hashes_by_probability(target_false_positive_prob)
        self.assertIsInstance(num_hashes, int)
        self.assertGreater(num_hashes, 0)

        num_bits = bloom_filter.suggest_num_filter_bits(max_distinct_items, target_false_positive_prob)
        self.assertIsInstance(num_bits, int)
        self.assertGreater(num_bits, 0)

        num_hashes_alt = bloom_filter.suggest_num_hashes(max_distinct_items, num_bits)
        self.assertIsInstance(num_hashes_alt, int)
        self.assertGreater(num_hashes_alt, 0)

    # ------------------------------------------------------------------
    # Serialization
    # ------------------------------------------------------------------

    def test_bloom_filter_serialization(self):
        """Test serialization and deserialization."""
        bf = bloom_filter.create_by_accuracy(1000, 0.01)

        test_items = ["item1", "item2", "item3"]
        for item in test_items:
            bf.update(item)

        bf_bytes = bf.serialize()
        self.assertEqual(bf.get_serialized_size_bytes(), len(bf_bytes))

        new_bf = bloom_filter.deserialize(bf_bytes)

        self._assert_contains_all(new_bf, test_items)

        self.assertFalse(new_bf.is_empty())
        self._assert_same_configuration(bf, new_bf)

    def test_empty_serialization(self):
        """Test serialization of empty bloom filter."""
        bf = bloom_filter.create_by_accuracy(1000, 0.01)
        self.assertTrue(bf.is_empty())

        bf_bytes = bf.serialize()
        self.assertEqual(bf.get_serialized_size_bytes(), len(bf_bytes))

        new_bf = bloom_filter.deserialize(bf_bytes)
        self.assertTrue(new_bf.is_empty())
        self._assert_same_configuration(bf, new_bf, compare_bits_used=False)

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    def test_bits_used_properties(self):
        """Test that bits_used behaves correctly."""
        bf = bloom_filter.create_by_accuracy(1000, 0.01)

        self.assertEqual(bf.num_bits_used, 0)

        bf.update("alpha")
        bits1 = bf.num_bits_used
        self.assertIsInstance(bits1, int)
        self.assertGreater(bits1, 0)

        # Reading the property twice must give the same value
        bits1_again = bf.num_bits_used
        self.assertEqual(bits1_again, bits1)

        # Same item shouldn't change bits_used
        bf.update("alpha")
        bits_dup = bf.num_bits_used
        self.assertEqual(bits_dup, bits1)

        # Additional items should be non-decreasing
        for s in ["beta", "gamma", "delta"]:
            bf.update(s)
            new_bits = bf.num_bits_used
            self.assertGreaterEqual(new_bits, bits1)
            bits1 = new_bits

    def test_capacity_properties(self):
        """Test that capacity is positive and constant."""
        bf = bloom_filter.create_by_accuracy(1000, 0.01)
        cap1 = bf.capacity
        self.assertIsInstance(cap1, int)
        self.assertGreater(cap1, 0)

        cap2 = bf.capacity
        self.assertEqual(cap1, cap2)

        # Updates must not change the reported capacity
        bf.update("alpha")
        bf.update("beta")
        self.assertEqual(cap1, bf.capacity)

    def test_num_hashes_properties(self):
        """Test that num_hashes is consistent."""
        bf = bloom_filter.create_by_accuracy(1000, 0.01)
        k1 = bf.num_hashes
        self.assertIsInstance(k1, int)
        self.assertGreaterEqual(k1, 1)
        bf.update("alpha")
        self.assertEqual(k1, bf.num_hashes)

        bf2 = bloom_filter.create_by_size(10000, 3)
        k2 = bf2.num_hashes
        self.assertIsInstance(k2, int)
        self.assertEqual(k2, 3)
        bf2.update("beta")
        self.assertEqual(k2, bf2.num_hashes)

    def test_seed_properties(self):
        """Test that seed is consistent."""
        bf = bloom_filter.create_by_accuracy(1000, 0.01)
        s1 = bf.seed
        self.assertIsInstance(s1, int)

        bf.update("alpha")
        self.assertEqual(s1, bf.seed)

        explicit_seed = 12345
        bf2 = bloom_filter.create_by_accuracy(1000, 0.01, explicit_seed)
        s2 = bf2.seed
        self.assertIsInstance(s2, int)
        self.assertEqual(s2, explicit_seed)

    # ------------------------------------------------------------------
    # Behavioural checks
    # ------------------------------------------------------------------

    def test_deterministic_behavior(self):
        """Test that bloom filters with the same seed behave deterministically."""
        seed = 12345
        max_distinct_items = 1000
        target_fpp = 0.01

        bf1 = bloom_filter.create_by_accuracy(max_distinct_items, target_fpp, seed)
        bf2 = bloom_filter.create_by_accuracy(max_distinct_items, target_fpp, seed)

        test_items = [f"item_{i} " for i in range(100)]
        for item in test_items:
            bf1.update(item)
            bf2.update(item)

        self._assert_same_configuration(bf1, bf2)

        for item in test_items:
            self.assertEqual(bf1.query(item), bf2.query(item))

    def test_edge_cases(self):
        """Test edge cases and boundary conditions."""
        bf_small = bloom_filter.create_by_size(64, 1)
        self.assertEqual(bf_small.capacity, 64)
        self.assertEqual(bf_small.num_hashes, 1)

        bf_many_hashes = bloom_filter.create_by_size(1000, 20)
        self.assertEqual(bf_many_hashes.num_hashes, 20)

        bf_low_fpp = bloom_filter.create_by_accuracy(100, 1e-6)
        self.assertGreater(bf_low_fpp.num_hashes, 1)

    def test_mathematical_properties(self):
        """Test mathematical properties of bloom filters."""
        # 100 is not a multiple of 64; the reported capacity is expected to be one.
        bf = bloom_filter.create_by_size(100, 5)
        self.assertEqual(bf.capacity % 64, 0)

        bf.update("test_item")
        self.assertLessEqual(bf.num_bits_used, bf.capacity)

        self.assertGreater(bf.num_hashes, 0)
        self.assertIsInstance(bf.seed, int)
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
0 commit comments