Skip to content

Commit b485d67

Browse files
authored
Fixed/improved filter statistics.
Fixed the FPP calculation, added the a-priori FPP and ISEP, removed flotation.
1 parent 85a94fe commit b485d67

1 file changed

Lines changed: 138 additions & 37 deletions

File tree

sbf.py

Lines changed: 138 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -546,8 +546,9 @@ def print_filter(self, mode, precision = 5):
546546
print('Filter details:')
547547
print(' - Number of cells: ' + str(self.num_cells))
548548
print(' - Size in bytes: ' + str(self.cell_size * self.num_cells))
549-
print(' - Filter sparsity: ' + str(round(self.filter_sparsity(), self.precision)))
550-
print(' - Filter fpp: ' + str(round(self.filter_fpp(), self.precision)))
549+
print(' - Filter sparsity: ' + str('{:.{prec}f}'.format(round(self.filter_sparsity(), self.precision), prec=self.precision)))
550+
print(' - Filter a-priori fpp: ' + str('{:.{prec}f}'.format(round(self.filter_apriori_fpp(), self.precision), prec=self.precision)))
551+
print(' - Filter fpp: ' + str('{:.{prec}f}'.format(round(self.filter_fpp(), self.precision), prec=self.precision)))
551552
print(' - Number of mapped elements: ' + str(self.members))
552553
print(' - Number of hash collisions: ' + str(self.collisions))
553554

@@ -566,15 +567,22 @@ def print_filter(self, mode, precision = 5):
566567
print('----------------')
567568
for self.j in range(1, self.num_areas + 1):
568569
self.potential_elements = (self.area_members[self.j] * self.num_hashes) - self.area_self_collisions[self.j]
569-
print('Area ' + str(self.j).rjust(3) + ': ' + str(self.area_members[self.j]) + ' members, ' + str(self.area_cells[self.j]) + ' cells out of ' + str(self.potential_elements) + ' potential (' + str(self.area_self_collisions[self.j]) + ' self-collisions)')
570+
print('Area ' + str(self.j).rjust(len(str(self.num_areas))) + ': ' \
571+
+ str(self.area_members[self.j]) + ' members, ' \
572+
+ str(self.area_cells[self.j]) + ' cells out of ' \
573+
+ str(self.potential_elements) + ' potential (' \
574+
+ str(self.area_self_collisions[self.j]) + ' self-collisions)')
570575

571-
print('\nEmersion and Fpp:\n')
576+
print('\nEmersion, FPP and ISEP:\n')
572577
self.compute_area_fpp()
578+
self.compute_apriori_area_fpp()
579+
self.compute_apriori_area_isep()
573580
for self.j in range(1, self.num_areas + 1):
574-
if (self.area_flotation(self.j)):
575-
print('Area ' + str(self.j) + ': emersion ' + str(round(self.area_emersion(self.j), self.precision)) + ', flotation safe, fpp ' + str(round(self.area_fpp[self.j], self.precision)))
576-
else:
577-
print('Area ' + str(self.j) + ': emersion ' + str(round(self.area_emersion(self.j), self.precision)) + ', flotation unsafe, fpp ' + str(round(self.area_fpp[self.j], self.precision)))
581+
print('Area ' + str(self.j).rjust(len(str(self.num_areas))) + \
582+
': emersion ' + str('{:.{prec}f}'.format(round(self.area_emersion(self.j), self.precision), prec=self.precision)) + \
583+
', a-priori fpp ' + str('{:.{prec}f}'.format(round(self.area_apriori_fpp[self.j], self.precision), prec=self.precision)) + \
584+
', fpp ' + str('{:.{prec}f}'.format(round(self.area_fpp[self.j], self.precision), prec=self.precision)) + \
585+
', a-priori isep ' + str('{:.{prec}f}'.format(round(self.area_apriori_isep[self.j], self.precision), prec=self.precision)))
578586

579587
del self.j
580588
del self.mode
@@ -584,13 +592,13 @@ def print_filter(self, mode, precision = 5):
584592
def save_filter(self, filter_path, mode, precision = 5):
585593
""" Saves the filter and related statistics onto a CSV file.
586594
587-
Saves to disk the filter and related statistics (according to the specified
595+
Saves to disk the filter or its statistics (according to the specified
588596
operation mode) to the specified path.
589597
590598
Args:
591599
filter_path: the path to the file where to store the filter
592600
information.
593-
mode: If 0, writes the SBF metadata only (CSV: key,value);
601+
mode: If 0, writes the SBF metadata (CSV: key,value);
594602
if 1, writes the SBF cells (CSV: value).
595603
precision: Sets the precision (number of decimal places) to use
596604
when printing float values.
@@ -619,14 +627,24 @@ def save_filter(self, filter_path, mode, precision = 5):
619627
self.filter_file.write("byte_size" + ";" + str(self.cell_size*self.num_cells) + "\n")
620628
self.filter_file.write("members" + ";" + str(self.members) + "\n")
621629
self.filter_file.write("collisions" + ";" + str(self.collisions) + "\n")
622-
self.filter_file.write("sparsity" + ";" + str(round(self.filter_sparsity(), self.precision)) + "\n")
623-
self.filter_file.write("fpp" + ";" + str(round(self.filter_fpp(), self.precision)) + "\n")
630+
self.filter_file.write("sparsity" + ";" + str('{:.{prec}f}'.format(round(self.filter_sparsity(), self.precision), prec=self.precision)) + "\n")
631+
self.filter_file.write("a-priori fpp" + ";" + str('{:.{prec}f}'.format(round(self.filter_apriori_fpp(), self.precision), prec=self.precision)) + "\n")
632+
self.filter_file.write("fpp" + ";" + str('{:.{prec}f}'.format(round(self.filter_fpp(), self.precision), prec=self.precision)) + "\n")
624633

625634
# area-related parameters:
626-
# area,members,self-collisions,cells,emersion,flotation,fpp
635+
# area, members, self-collisions, cells, emersion, apriori_fpp, fpp, apriori_isep
627636
self.compute_area_fpp()
637+
self.compute_apriori_area_fpp()
638+
self.compute_apriori_area_isep()
628639
for self.j in range(1, self.num_areas+1):
629-
self.filter_file.write(str(self.j) + ";" + str(self.area_members[self.j]) + ";" + str(self.area_self_collisions[self.j]) + ";" + str(self.area_cells[self.j]) + ";" + str(round(self.area_emersion(self.j), self.precision)) + ";" + str(self.area_flotation(self.j)) + ";" + str(round(self.area_fpp[self.j], self.precision)) + "\n")
640+
self.filter_file.write(str(self.j) + ";" + \
641+
str(self.area_members[self.j]) + ";" + \
642+
str(self.area_self_collisions[self.j]) + ";" + \
643+
str(self.area_cells[self.j]) + ";" + \
644+
str('{:.{prec}f}'.format(round(self.area_emersion(self.j), self.precision), prec=self.precision)) + ";" + \
645+
str('{:.{prec}f}'.format(round(self.area_apriori_fpp[self.j], self.precision), prec=self.precision)) + ";" + \
646+
str('{:.{prec}f}'.format(round(self.area_fpp[self.j], self.precision), prec=self.precision)) + ";" + \
647+
str('{:.{prec}f}'.format(round(self.area_apriori_isep[self.j], self.precision), prec=self.precision)) + "\n")
630648

631649
del self.j
632650

@@ -659,7 +677,7 @@ def compute_area_fpp(self):
659677
the filter statistics).
660678
661679
Returns:
662-
The list of false positives probability (fpp) for the areas.
680+
The list of a-posteriori false positives probability (fpp) for the areas.
663681
"""
664682

665683
self.area_fpp = [0]*(self.num_areas + 1)
@@ -688,6 +706,90 @@ def compute_area_fpp(self):
688706
return self.area_fpp
689707

690708

709+
def compute_apriori_area_fpp(self):
710+
""" Computes a-priori false positives probability for each area.
711+
712+
Computes the a-priori false positives probability (fpp) for each
713+
area. This method needs to be called manually after the last insert
714+
operation in order to generate the statistics correctly. Alternatively,
715+
it is called by both the print_filter and the save_filter (when saving
716+
the filter statistics).
717+
718+
Returns:
719+
The list of a-priori false positives probability (fpp) for the areas.
720+
"""
721+
722+
self.area_apriori_fpp = [0]*(self.num_areas + 1)
723+
724+
for self.i in range(self.num_areas, 0, -1):
725+
726+
self.c = 0
727+
self.p = 0
728+
729+
for self.j in range(self.i, self.num_areas+1):
730+
self.c += self.area_members[self.j]
731+
732+
self.p = 1 - (1 / self.num_cells)
733+
734+
self.p = 1 - pow(self.p, (self.num_hashes * self.c))
735+
736+
self.p = pow(self.p, self.num_hashes)
737+
738+
self.area_apriori_fpp[self.i] = self.p
739+
740+
for self.j in range(self.i, self.num_areas):
741+
self.area_apriori_fpp[self.i] -= self.area_apriori_fpp[self.j + 1]
742+
743+
if (self.area_apriori_fpp[self.i] < 0):
744+
self.area_apriori_fpp[self.i] = 0
745+
746+
del self.j
747+
del self.c
748+
del self.p
749+
del self.i
750+
751+
return self.area_apriori_fpp
752+
753+
754+
def compute_apriori_area_isep(self):
755+
""" Computes a-priori inter-set error probability for each area.
756+
757+
Computes the a-priori inter-set error probability (isep) for each
758+
area. This method needs to be called manually after the last insert
759+
operation in order to generate the statistics correctly. Alternatively,
760+
it is called by both the print_filter and the save_filter (when saving
761+
the filter statistics).
762+
763+
Returns:
764+
The list of a-priori inter-set error probability (isep) for the areas.
765+
"""
766+
767+
self.area_apriori_isep = [0]*(self.num_areas + 1)
768+
769+
for self.i in range(self.num_areas, 0, -1):
770+
771+
self.nfill = 0
772+
self.p = 0
773+
774+
for self.j in range(self.i+1, self.num_areas+1):
775+
self.nfill += self.area_members[self.j]
776+
777+
self.p = 1 - (1 / self.num_cells)
778+
779+
self.p = 1 - pow(self.p, (self.num_hashes * self.nfill))
780+
781+
self.p = pow(self.p, self.num_hashes)
782+
783+
self.area_apriori_isep[self.i] = self.p
784+
785+
del self.nfill
786+
del self.p
787+
del self.i
788+
del self.j
789+
790+
return self.area_apriori_isep
791+
792+
691793
def filter_sparsity(self):
692794
""" Returns the sparsity of the SBF.
693795
@@ -713,10 +815,10 @@ def filter_fpp(self):
713815
filter (i.e. not area-specific).
714816
715817
Returns:
716-
The filter false positives probability (fpp).
818+
The filter a-posteriori false positives probability (fpp).
717819
"""
718820

719-
self.c =0
821+
self.c = 0
720822

721823
# Counts non-zero cells
722824
for self.i in range(1, self.num_areas + 1):
@@ -727,6 +829,25 @@ def filter_fpp(self):
727829
return pow(self.p, self.num_hashes)
728830

729831

832+
def filter_apriori_fpp(self):
833+
""" Computes a-priori false positives probability for the filter.
834+
835+
Computes the a-priori false positive probability over the entire
836+
filter (i.e. not area-specific).
837+
838+
Returns:
839+
The filter a-priori false positives probability (fpp).
840+
"""
841+
842+
self.p = 1 - (1 / self.num_cells)
843+
844+
self.p = 1 - pow(self.p, (self.num_hashes * self.members))
845+
846+
self.p = pow(self.p, self.num_hashes)
847+
848+
return self.p
849+
850+
730851
def area_emersion(self, area):
731852
""" Computes the emersion value for an area.
732853
@@ -750,23 +871,3 @@ def area_emersion(self, area):
750871
return -1
751872
else:
752873
return (self.area_cells[self.area] / ((self.area_members[self.area] * self.num_hashes) - self.area_self_collisions[self.area]))
753-
754-
755-
def area_flotation(self, area):
756-
""" Computes the flotation value for an area.
757-
758-
Computes the flotation value for the input area. The flotation is True if
759-
it is not possible for an element belonging to the area to be recognized
760-
as belonging to a different area, False if collisions may cause this to
761-
happen.
762-
763-
Returns:
764-
The flotation value (boolean).
765-
"""
766-
767-
self.area = area
768-
769-
if (self.area_members[self.area] == 0):
770-
return True
771-
else:
772-
return (self.area_members[self.area] * self.num_hashes) - self.area_self_collisions[self.area] - self.area_cells[self.area] < self.num_hashes

0 commit comments

Comments
 (0)