1414# KIND, either express or implied. See the License for the
1515# specific language governing permissions and limitations
1616# under the License.
17-
17+
1818import unittest
19- from datasketches import create_bloom_filter
19+ from datasketches import bloom_filter
2020
2121class BloomFilterTest (unittest .TestCase ):
22- def test_create_bloom_filter (self ):
23- """Test that we can create a bloom filter with basic parameters"""
24- bf = create_bloom_filter (1000 , 0.01 )
25- self .assertIsNotNone (bf )
26- self .assertTrue (bf .is_empty ())
27-
28- def test_bloom_filter_empty_state (self ):
29- """Test that newly created bloom filter is empty"""
30- bf = create_bloom_filter (100 , 0.05 )
31- self .assertTrue (bf .is_empty ())
32-
33- def test_bloom_filter_update_and_query (self ):
34- """Test basic update and query functionality"""
35- bf = create_bloom_filter (1000 , 0.01 )
36-
37- # Initially empty
38- self .assertTrue (bf .is_empty ())
39- self .assertFalse (bf .query ("test_item" ))
40-
41- # Add an item
42- bf .update ("test_item" )
43- self .assertFalse (bf .is_empty ())
44- self .assertTrue (bf .query ("test_item" ))
45-
46- # Query for item not in filter
47- self .assertFalse (bf .query ("other_item" ))
48-
49- def test_bloom_filter_serialize_deserialize (self ):
50- """Test that we can serialize a bloom filter and restore it afterwards"""
51- bf = create_bloom_filter (1000 , 0.01 )
52- bf .update ("test_item" )
53- serialized = bf .serialize ()
54- self .assertIsNotNone (serialized )
55- self .assertTrue (len (serialized ) > 0 )
56-
57- bf = create_bloom_filter (1000 , 0.01 )
58- items = ["alpha" , "beta" , "gamma" ]
59- for it in items :
60- bf .update (it )
61-
62- payload = bf .serialize ()
63- self .assertTrue (len (payload ) > 0 )
64- self .assertEqual (bf .get_serialized_size_bytes (), len (payload ))
65-
66- restored = bf .deserialize (payload )
67- self .assertFalse (restored .is_empty ())
22+ def test_bloom_filter_accuracy_constructor (self ):
23+ # Test the accuracy-based constructor (max_distinct_items, target_false_positive_prob, seed)
24+ max_distinct_items = 1000
25+ target_false_positive_prob = 0.01
26+
27+ # Create bloom filter using accuracy parameters
28+ bf = bloom_filter .create_by_accuracy (max_distinct_items , target_false_positive_prob )
29+ self .assertTrue (bf .is_empty ())
30+
31+ # Add some items
32+ test_items = ["item1" , "item2" , "item3" , "item4" , "item5" ]
33+ for item in test_items :
34+ bf .update (item )
35+
36+ self .assertFalse (bf .is_empty ())
37+
38+ # Query items that were added
39+ for item in test_items :
40+ self .assertTrue (bf .query (item ))
41+
42+ # Query items that were not added (should mostly return False, but may have false positives)
43+ non_existent_items = ["not_item1" , "not_item2" , "not_item3" ]
44+ for item in non_existent_items :
45+ # We can't assert False here due to false positive possibility
46+ # Just verify the method works
47+ result = bf .query (item )
48+ self .assertIsInstance (result , bool )
6849
69- # Inserted items should come back as "might be present" (very high probability true)
70- for it in items :
71- self .assertTrue (restored .query (it ), f"Expected present after round-trip: { it } " )
50+ def test_bloom_filter_size_constructor (self ):
51+ # Test the size-based constructor (num_bits, num_hashes, seed)
52+ num_bits = 8192 # 8KB in bits
53+ num_hashes = 5
54+
55+ # Create bloom filter using size parameters
56+ bf = bloom_filter .create_by_size (num_bits , num_hashes )
57+ self .assertTrue (bf .is_empty ())
58+
59+ # Add some items
60+ test_items = ["item1" , "item2" , "item3" ]
61+ for item in test_items :
62+ bf .update (item )
63+
64+ self .assertFalse (bf .is_empty ())
65+
66+ # Query items that were added
67+ for item in test_items :
68+ self .assertTrue (bf .query (item ))
7269
73- # A not-inserted key should usually be absent (Bloom could FP, but unlikely here)
74- self .assertFalse (restored .query ("not_inserted" ))
70+ def test_bloom_filter_static_methods (self ):
71+ # Test the static helper methods
72+ max_distinct_items = 1000
73+ target_false_positive_prob = 0.01
74+
75+ # Test suggest_num_hashes_by_probability
76+ num_hashes = bloom_filter .suggest_num_hashes_by_probability (target_false_positive_prob )
77+ self .assertIsInstance (num_hashes , int )
78+ self .assertGreater (num_hashes , 0 )
79+
80+ # Test suggest_num_filter_bits
81+ num_bits = bloom_filter .suggest_num_filter_bits (max_distinct_items , target_false_positive_prob )
82+ self .assertIsInstance (num_bits , int )
83+ self .assertGreater (num_bits , 0 )
84+
85+ # Test suggest_num_hashes with both parameters
86+ num_hashes_alt = bloom_filter .suggest_num_hashes (max_distinct_items , num_bits )
87+ self .assertIsInstance (num_hashes_alt , int )
88+ self .assertGreater (num_hashes_alt , 0 )
7589
90+ def test_bloom_filter_serialization (self ):
91+ # Test serialization and deserialization
92+ bf = bloom_filter .create_by_accuracy (1000 , 0.01 )
93+
94+ # Add some items
95+ test_items = ["item1" , "item2" , "item3" ]
96+ for item in test_items :
97+ bf .update (item )
98+
99+ # Serialize
100+ bf_bytes = bf .serialize ()
101+ self .assertEqual (bf .get_serialized_size_bytes (), len (bf_bytes ))
102+
103+ # Deserialize
104+ new_bf = bloom_filter .deserialize (bf_bytes )
105+
106+ # Verify the deserialized filter has the same behavior
107+ for item in test_items :
108+ self .assertTrue (new_bf .query (item ))
109+
110+ # Verify it's not empty
111+ self .assertFalse (new_bf .is_empty ())
76112
77113if __name__ == '__main__' :
78114 unittest .main ()
0 commit comments