Skip to content

Commit df21d49

Browse files
committed
More robust
1 parent 3638b03 commit df21d49

2 files changed

Lines changed: 67 additions & 32 deletions

File tree

include/binaryfusefilter.h

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,22 @@
1313
// highly unlikely
1414
#endif
1515

16+
/**
 * qsort() comparator that orders two uint64_t values in ascending order.
 *
 * Returns a negative, zero or positive value when *a is respectively smaller
 * than, equal to or greater than *b. The result is built from comparisons
 * rather than from a subtraction: a 64-bit difference narrowed to int loses
 * its high bits, which can report distinct keys as equal or flip the sign.
 */
static int binary_fuse_cmpfunc(const void *a, const void *b) {
  const uint64_t lhs = *(const uint64_t *)a;
  const uint64_t rhs = *(const uint64_t *)b;
  return (lhs > rhs) - (lhs < rhs);
}

/**
 * Sorts keys[0..length) in ascending order and compacts the array in place so
 * that every distinct value appears exactly once at the front.
 *
 * @param keys   array of keys; it is reordered and partially overwritten
 * @param length number of entries in keys
 * @return the number of distinct keys, stored in keys[0..result); entries
 *         past that index are left in an unspecified state
 */
static size_t binary_fuse_sort_and_remove_dup(uint64_t *keys, size_t length) {
  if (length == 0) {
    return 0; // an empty array has no distinct keys
  }
  qsort(keys, length, sizeof(uint64_t), binary_fuse_cmpfunc);
  // keys[0] is always kept, so the write cursor starts right after it.
  size_t unique = 1;
  for (size_t i = 1; i < length; i++) {
    // Compare against the last key that was kept, never against a slot that
    // may already have been overwritten.
    if (keys[i] != keys[unique - 1]) {
      keys[unique] = keys[i];
      unique++;
    }
  }
  return unique;
}
31+
1632
/**
1733
* We start with a few utilities.
1834
***/
@@ -257,7 +273,7 @@ static inline uint8_t binary_fuse_mod3(uint8_t x) {
257273
// The caller is responsible for calling binary_fuse8_allocate(size,filter)
258274
// before. For best performance, the caller should ensure that there are not too
259275
// many duplicated keys.
260-
static inline bool binary_fuse8_populate(const uint64_t *keys, uint32_t size,
276+
static inline bool binary_fuse8_populate(uint64_t *keys, uint32_t size,
261277
binary_fuse8_t *filter) {
262278
uint64_t rng_counter = 0x726b2b9d438b9d4d;
263279
filter->Seed = binary_fuse_rng_splitmix64(&rng_counter);
@@ -290,17 +306,15 @@ static inline bool binary_fuse8_populate(const uint64_t *keys, uint32_t size,
290306
for (int loop = 0; true; ++loop) {
291307
if (loop + 1 > XOR_MAX_ITERATIONS) {
292308
// The probability of this happening is lower than the
293-
// the cosmic-ray probability (i.e., a cosmic ray corrupts your system),
294-
// but if it happens, we just fill the fingerprint with ones which
295-
// will flag all possible keys as 'possible', ensuring a correct result.
309+
// cosmic-ray probability (i.e., a cosmic ray corrupts your system).
296310
memset(filter->Fingerprints, ~0, filter->ArrayLength);
297311
free(alone);
298312
free(t2count);
299313
free(reverseH);
300314
free(t2hash);
301315
free(reverseOrder);
302316
free(startPos);
303-
return true;
317+
return false;
304318
}
305319

306320
for (uint32_t i = 0; i < block; i++) {
@@ -405,6 +419,8 @@ static inline bool binary_fuse8_populate(const uint64_t *keys, uint32_t size,
405419
// success
406420
size = stacksize;
407421
break;
422+
} else if(duplicates > 0) {
423+
size = binary_fuse_sort_and_remove_dup(keys, size);
408424
}
409425
memset(reverseOrder, 0, sizeof(uint64_t) * size);
410426
memset(t2count, 0, sizeof(uint8_t) * capacity);
@@ -541,7 +557,7 @@ static inline void binary_fuse16_free(binary_fuse16_t *filter) {
541557
// The caller is responsible for calling binary_fuse16_allocate(size,filter)
542558
// before. For best performance, the caller should ensure that there are not too
543559
// many duplicated keys.
544-
static inline bool binary_fuse16_populate(const uint64_t *keys, uint32_t size,
560+
static inline bool binary_fuse16_populate(uint64_t *keys, uint32_t size,
545561
binary_fuse16_t *filter) {
546562
uint64_t rng_counter = 0x726b2b9d438b9d4d;
547563
filter->Seed = binary_fuse_rng_splitmix64(&rng_counter);
@@ -574,17 +590,14 @@ static inline bool binary_fuse16_populate(const uint64_t *keys, uint32_t size,
574590
for (int loop = 0; true; ++loop) {
575591
if (loop + 1 > XOR_MAX_ITERATIONS) {
576592
// The probability of this happening is lower than the
577-
// the cosmic-ray probability (i.e., a cosmic ray corrupts your system),
578-
// but if it happens, we just fill the fingerprint with ones which
579-
// will flag all possible keys as 'possible', ensuring a correct result.
580-
memset(filter->Fingerprints, ~0, filter->ArrayLength * sizeof(uint16_t));
593+
// cosmic-ray probability (i.e., a cosmic ray corrupts your system).
581594
free(alone);
582595
free(t2count);
583596
free(reverseH);
584597
free(t2hash);
585598
free(reverseOrder);
586599
free(startPos);
587-
return true;
600+
return false;
588601
}
589602

590603
for (uint32_t i = 0; i < block; i++) {
@@ -689,6 +702,8 @@ static inline bool binary_fuse16_populate(const uint64_t *keys, uint32_t size,
689702
// success
690703
size = stacksize;
691704
break;
705+
} else if(duplicates > 0) {
706+
size = binary_fuse_sort_and_remove_dup(keys, size);
692707
}
693708
memset(reverseOrder, 0, sizeof(uint64_t) * size);
694709
memset(t2count, 0, sizeof(uint8_t) * capacity);

include/xorfilter.h

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,30 @@
77
#include <stdlib.h>
88
#include <string.h>
99

10+
#ifndef XOR_SORT_ITERATIONS
11+
#define XOR_SORT_ITERATIONS 10 // after 10 iterations, we sort and remove duplicates
12+
#endif
13+
1014
#ifndef XOR_MAX_ITERATIONS
1115
#define XOR_MAX_ITERATIONS 100 // probability of success should always be > 0.5 so 100 iterations is highly unlikely
1216
#endif
1317

18+
19+
/**
 * qsort() comparator that orders two uint64_t values in ascending order.
 *
 * Returns a negative, zero or positive value when *a is respectively smaller
 * than, equal to or greater than *b. The result is built from comparisons
 * rather than from a subtraction: a 64-bit difference narrowed to int loses
 * its high bits, which can report distinct keys as equal or flip the sign.
 */
static int xor_cmpfunc(const void *a, const void *b) {
  const uint64_t lhs = *(const uint64_t *)a;
  const uint64_t rhs = *(const uint64_t *)b;
  return (lhs > rhs) - (lhs < rhs);
}

/**
 * Sorts keys[0..length) in ascending order and compacts the array in place so
 * that every distinct value appears exactly once at the front.
 *
 * @param keys   array of keys; it is reordered and partially overwritten
 * @param length number of entries in keys
 * @return the number of distinct keys, stored in keys[0..result); entries
 *         past that index are left in an unspecified state
 */
static size_t xor_sort_and_remove_dup(uint64_t *keys, size_t length) {
  if (length == 0) {
    return 0; // an empty array has no distinct keys
  }
  qsort(keys, length, sizeof(uint64_t), xor_cmpfunc);
  // keys[0] is always kept, so the write cursor starts right after it.
  size_t unique = 1;
  for (size_t i = 1; i < length; i++) {
    // Compare against the last key that was kept, never against a slot that
    // may already have been overwritten.
    if (keys[i] != keys[unique - 1]) {
      keys[unique] = keys[i];
      unique++;
    }
  }
  return unique;
}
1434
/**
1535
* We assume that you have a large set of 64-bit integers
1636
* and you want a data structure to do membership tests using
@@ -424,7 +444,7 @@ static inline uint32_t xor_flushone_decrement_buffer(xor_setbuffer_t *buffer,
424444
// The caller is responsible for calling xor8_allocate(size,filter)
425445
// before. For best performance, the caller should ensure that there are not too
426446
// many duplicated keys.
427-
static inline bool xor8_buffered_populate(const uint64_t *keys, uint32_t size, xor8_t *filter) {
447+
static inline bool xor8_buffered_populate(uint64_t *keys, uint32_t size, xor8_t *filter) {
428448
if(size == 0) { return false; }
429449
uint64_t rng_counter = 1;
430450
filter->seed = xor_rng_splitmix64(&rng_counter);
@@ -470,12 +490,12 @@ static inline bool xor8_buffered_populate(const uint64_t *keys, uint32_t size, x
470490

471491
while (true) {
472492
iterations ++;
493+
if(iterations == XOR_SORT_ITERATIONS) {
494+
size = xor_sort_and_remove_dup(keys, size);
495+
}
473496
if(iterations > XOR_MAX_ITERATIONS) {
474497
// The probability of this happening is lower than the
475-
// the cosmic-ray probability (i.e., a cosmic ray corrupts your system),
476-
// but if it happens, we just fill the fingerprint with ones which
477-
// will flag all possible keys as 'possible', ensuring a correct result.
478-
memset(filter->fingerprints, ~0, 3 * filter->blockLength);
498+
// cosmic-ray probability (i.e., a cosmic ray corrupts your system).
479499
xor_free_buffer(&buffer0);
480500
xor_free_buffer(&buffer1);
481501
xor_free_buffer(&buffer2);
@@ -635,7 +655,7 @@ static inline bool xor8_buffered_populate(const uint64_t *keys, uint32_t size, x
635655
// The caller is responsible for calling xor8_allocate(size,filter)
636656
// before. For best performance, the caller should ensure that there are not too
637657
// many duplicated keys.
638-
static inline bool xor8_populate(const uint64_t *keys, uint32_t size, xor8_t *filter) {
658+
static inline bool xor8_populate(uint64_t *keys, uint32_t size, xor8_t *filter) {
639659
if(size == 0) { return false; }
640660
uint64_t rng_counter = 1;
641661
filter->seed = xor_rng_splitmix64(&rng_counter);
@@ -668,12 +688,12 @@ static inline bool xor8_populate(const uint64_t *keys, uint32_t size, xor8_t *fi
668688

669689
while (true) {
670690
iterations ++;
691+
if(iterations == XOR_SORT_ITERATIONS) {
692+
size = xor_sort_and_remove_dup(keys, size);
693+
}
671694
if(iterations > XOR_MAX_ITERATIONS) {
672695
// The probability of this happening is lower than the
673-
// the cosmic-ray probability (i.e., a cosmic ray corrupts your system),
674-
// but if it happens, we just fill the fingerprint with ones which
675-
// will flag all possible keys as 'possible', ensuring a correct result.
676-
memset(filter->fingerprints, ~0, 3 * filter->blockLength);
696+
// cosmic-ray probability (i.e., a cosmic ray corrupts your system).
677697
free(sets);
678698
free(Q);
679699
free(stack);
@@ -842,7 +862,7 @@ static inline bool xor8_populate(const uint64_t *keys, uint32_t size, xor8_t *fi
842862
// The caller is responsible for calling xor16_allocate(size,filter)
843863
// before. For best performance, the caller should ensure that there are not too
844864
// many duplicated keys.
845-
static inline bool xor16_buffered_populate(const uint64_t *keys, uint32_t size, xor16_t *filter) {
865+
static inline bool xor16_buffered_populate(uint64_t *keys, uint32_t size, xor16_t *filter) {
846866
if(size == 0) { return false; }
847867
uint64_t rng_counter = 1;
848868
filter->seed = xor_rng_splitmix64(&rng_counter);
@@ -888,12 +908,12 @@ static inline bool xor16_buffered_populate(const uint64_t *keys, uint32_t size,
888908

889909
while (true) {
890910
iterations ++;
911+
if(iterations == XOR_SORT_ITERATIONS) {
912+
size = xor_sort_and_remove_dup(keys, size);
913+
}
891914
if(iterations > XOR_MAX_ITERATIONS) {
892915
// The probability of this happening is lower than the
893-
// the cosmic-ray probability (i.e., a cosmic ray corrupts your system),
894-
// but if it happens, we just fill the fingerprint with ones which
895-
// will flag all possible keys as 'possible', ensuring a correct result.
896-
memset(filter->fingerprints, ~0, 3 * filter->blockLength * sizeof(uint16_t));
916+
// cosmic-ray probability (i.e., a cosmic ray corrupts your system).
897917
xor_free_buffer(&buffer0);
898918
xor_free_buffer(&buffer1);
899919
xor_free_buffer(&buffer2);
@@ -1056,7 +1076,7 @@ static inline bool xor16_buffered_populate(const uint64_t *keys, uint32_t size,
10561076
// The caller is responsible for calling xor16_allocate(size,filter)
10571077
// before. For best performance, the caller should ensure that there are not too
10581078
// many duplicated keys.
1059-
static inline bool xor16_populate(const uint64_t *keys, uint32_t size, xor16_t *filter) {
1079+
static inline bool xor16_populate(uint64_t *keys, uint32_t size, xor16_t *filter) {
10601080
if(size == 0) { return false; }
10611081
uint64_t rng_counter = 1;
10621082
filter->seed = xor_rng_splitmix64(&rng_counter);
@@ -1090,16 +1110,16 @@ static inline bool xor16_populate(const uint64_t *keys, uint32_t size, xor16_t *
10901110

10911111
while (true) {
10921112
iterations ++;
1113+
if(iterations == XOR_SORT_ITERATIONS) {
1114+
size = xor_sort_and_remove_dup(keys, size);
1115+
}
10931116
if(iterations > XOR_MAX_ITERATIONS) {
10941117
// The probability of this happening is lower than the
1095-
// the cosmic-ray probability (i.e., a cosmic ray corrupts your system),
1096-
// but if it happens, we just fill the fingerprint with ones which
1097-
// will flag all possible keys as 'possible', ensuring a correct result.
1098-
memset(filter->fingerprints, ~0, 3 * filter->blockLength * sizeof(uint16_t));
1118+
// cosmic-ray probability (i.e., a cosmic ray corrupts your system).
10991119
free(sets);
11001120
free(Q);
11011121
free(stack);
1102-
return true;
1122+
return false;
11031123
}
11041124

11051125
memset(sets, 0, sizeof(xor_xorset_t) * arrayLength);

0 commit comments

Comments
 (0)