66#include " common/bloom_filter.hpp"
77
88#include < cmath>
9+ #include < cstdint>
10+
// bswap64(x): converts a 64-bit value between host byte order and the
// big-endian on-disk/wire order used by the serialized filter header.
//
// The conversion is its own inverse (identity on big-endian hosts, a byte swap
// on little-endian hosts), so the same macro is used for both writing and
// reading. The historical name is kept so existing call sites keep compiling;
// note that it is NOT an unconditional swap: an unconditional swap would make
// the serialized layout depend on the endianness of the machine that wrote it.
#if defined(__APPLE__)
#include <libkern/OSByteOrder.h>
#define bswap64(x) OSSwapHostToBigInt64(x)
#else
#include <endian.h>
#define bswap64(x) htobe64(x)
#endif
918
1019namespace cloudsql ::common {
1120
1221BloomFilter::BloomFilter (size_t expected_elements, double false_positive_rate)
1322 : expected_elements_(expected_elements) {
23+ // Handle zero expected_elements as empty filter
24+ if (expected_elements == 0 ) {
25+ num_bits_ = 0 ;
26+ num_hashes_ = 0 ;
27+ return ;
28+ }
29+
30+ // Fall back to a default rate of 0.01 when false_positive_rate is outside the open interval (0, 1)
31+ double p = false_positive_rate;
32+ if (p <= 0.0 || p >= 1.0 ) {
33+ p = 0.01 ; // Safe default
34+ }
35+
1436 // m = -n * ln(p) / (ln(2)^2)
1537 // k = m/n * ln(2)
1638 double n = static_cast <double >(expected_elements);
17- double p = false_positive_rate;
1839
1940 double m = -n * std::log (p) / (std::log (2 ) * std::log (2 ));
2041 double k = (m / n) * std::log (2 );
@@ -37,25 +58,45 @@ BloomFilter::BloomFilter(size_t expected_elements, double false_positive_rate)
3758}
3859
3960BloomFilter::BloomFilter (const uint8_t * data, size_t size) {
40- if (size < sizeof (size_t ) * 3 ) {
61+ // Minimum size: 3 x uint64_t header + at least 1 byte of bits
62+ if (size < sizeof (uint64_t ) * 3 + 1 ) {
4163 return ; // Invalid data
4264 }
4365
4466 size_t offset = 0 ;
45- std::memcpy (&num_bits_, data + offset, sizeof (size_t ));
46- offset += sizeof (size_t );
47-
48- std::memcpy (&num_hashes_, data + offset, sizeof (size_t ));
49- offset += sizeof (size_t );
50-
51- std::memcpy (&expected_elements_, data + offset, sizeof (size_t ));
52- offset += sizeof (size_t );
5367
68+ // Read with fixed-width uint64_t and proper byte-order conversion
69+ uint64_t tmp_num_bits = 0 ;
70+ std::memcpy (&tmp_num_bits, data + offset, sizeof (uint64_t ));
71+ tmp_num_bits = bswap64 (tmp_num_bits);
72+ num_bits_ = static_cast <size_t >(tmp_num_bits);
73+ offset += sizeof (uint64_t );
74+
75+ uint64_t tmp_num_hashes = 0 ;
76+ std::memcpy (&tmp_num_hashes, data + offset, sizeof (uint64_t ));
77+ tmp_num_hashes = bswap64 (tmp_num_hashes);
78+ num_hashes_ = static_cast <size_t >(tmp_num_hashes);
79+ offset += sizeof (uint64_t );
80+
81+ uint64_t tmp_expected = 0 ;
82+ std::memcpy (&tmp_expected, data + offset, sizeof (uint64_t ));
83+ tmp_expected = bswap64 (tmp_expected);
84+ expected_elements_ = static_cast <size_t >(tmp_expected);
85+ offset += sizeof (uint64_t );
86+
87+ // Validate bit array size
5488 size_t bit_bytes = (num_bits_ + 7 ) / 8 ;
55- if (size >= offset + bit_bytes) {
56- bits_.resize (bit_bytes);
57- std::memcpy (bits_.data (), data + offset, bit_bytes);
89+ if (size < offset + bit_bytes) {
90+ // Truncated payload - reset to safe empty state
91+ num_bits_ = 0 ;
92+ num_hashes_ = 0 ;
93+ expected_elements_ = 0 ;
94+ bits_.clear ();
95+ return ;
5896 }
97+
98+ bits_.resize (bit_bytes);
99+ std::memcpy (bits_.data (), data + offset, bit_bytes);
59100}
60101
61102size_t BloomFilter::murmur3_hash (const Value& key) const {
@@ -84,14 +125,21 @@ size_t BloomFilter::murmur3_hash(const uint8_t* data, size_t len, size_t seed) c
84125
85126size_t BloomFilter::get_bit_position (size_t hash, size_t i) const {
86127 // Double hashing technique: h(i) = h1 + i * h2
87- // Use two different hash seeds
128+ // Make h2 key-dependent by rehashing the input hash with a different seed
88129 size_t h1 = hash;
89- size_t h2 = murmur3_hash (reinterpret_cast <const uint8_t *>(" salt" ), 4 , 0xcafebabe );
130+ size_t h2 = murmur3_hash (reinterpret_cast <const uint8_t *>(&hash), sizeof (hash), 0xcafebabe );
131+
132+ // Ensure h2 is non-zero to avoid degenerate probing
133+ if (h2 == 0 ) {
134+ h2 = 1 ;
135+ }
90136
91137 return (h1 + i * h2) % num_bits_;
92138}
93139
94140void BloomFilter::insert (const Value& key) {
141+ if (num_bits_ == 0 ) return ; // Empty filter
142+
95143 size_t base_hash = murmur3_hash (key);
96144
97145 for (size_t i = 0 ; i < num_hashes_; ++i) {
@@ -103,6 +151,8 @@ void BloomFilter::insert(const Value& key) {
103151}
104152
105153bool BloomFilter::might_contain (const Value& key) const {
154+ if (num_bits_ == 0 ) return false ; // Empty filter
155+
106156 size_t base_hash = murmur3_hash (key);
107157
108158 for (size_t i = 0 ; i < num_hashes_; ++i) {
@@ -121,17 +171,21 @@ bool BloomFilter::might_contain(const Value& key) const {
121171std::vector<uint8_t > BloomFilter::serialize () const {
122172 std::vector<uint8_t > out;
123173
124- // Store metadata
125- out.resize (sizeof (size_t ) * 3 );
174+ // Store metadata using fixed-width uint64_t with byte-order conversion
175+ out.resize (sizeof (uint64_t ) * 3 );
126176 size_t offset = 0 ;
127- std::memcpy (out.data () + offset, &num_bits_, sizeof (size_t ));
128- offset += sizeof (size_t );
129177
130- std::memcpy (out.data () + offset, &num_hashes_, sizeof (size_t ));
131- offset += sizeof (size_t );
178+ uint64_t tmp_num_bits = bswap64 (static_cast <uint64_t >(num_bits_));
179+ std::memcpy (out.data () + offset, &tmp_num_bits, sizeof (uint64_t ));
180+ offset += sizeof (uint64_t );
181+
182+ uint64_t tmp_num_hashes = bswap64 (static_cast <uint64_t >(num_hashes_));
183+ std::memcpy (out.data () + offset, &tmp_num_hashes, sizeof (uint64_t ));
184+ offset += sizeof (uint64_t );
132185
133- std::memcpy (out.data () + offset, &expected_elements_, sizeof (size_t ));
134- offset += sizeof (size_t );
186+ uint64_t tmp_expected = bswap64 (static_cast <uint64_t >(expected_elements_));
187+ std::memcpy (out.data () + offset, &tmp_expected, sizeof (uint64_t ));
188+ offset += sizeof (uint64_t );
135189
136190 // Store bits
137191 size_t bit_bytes = (num_bits_ + 7 ) / 8 ;
0 commit comments