@@ -10,6 +10,7 @@ use crate::{try_as_boxed_iter, util};
1010use core:: ops:: Bound :: * ;
1111use itertools:: Itertools ;
1212use rand:: seq:: IteratorRandom ;
13+ use rand:: thread_rng;
1314use regex_syntax:: hir:: literal:: Seq ;
1415use regex_syntax:: Parser ;
1516use serde_bytes:: ByteBuf ;
@@ -886,7 +887,8 @@ where
886887
887888 if sum_histogram_buckets > 0 {
888889 let selectivity: f64 = ( count_matches as f64 ) / ( sum_histogram_buckets as f64 ) ;
889- Ok ( ( selectivity * ( universe_size as f64 ) ) . round ( ) as usize )
890+ let estimation = ( selectivity * ( universe_size as f64 ) ) . round ( ) as usize ;
891+ Ok ( estimation)
890892 } else {
891893 Ok ( 0 )
892894 }
@@ -919,9 +921,47 @@ where
919921 }
920922 } else {
921923 // For regular expressions without a prefix the worst case would be `.*[X].*` where `[X]` are the most common characters.
922- // Assume that a generic percentage (here 5%) of all nodes match the regex.
923- // TODO: find better ways of estimating this constant
924- guessed_count = ( 0.05 * ( total as f64 ) ) as usize ;
924+ // Sample values from the histogram to get a better estimation of how many percent of the actual values could match.
925+ if let Ok ( pattern) = regex:: Regex :: new ( & full_match_pattern) {
926+ let mut rng = thread_rng ( ) ;
927+ let qualified_keys = match ns {
928+ Some ( ns) => vec ! [ AnnoKey {
929+ name: name. into( ) ,
930+ ns: ns. into( ) ,
931+ } ] ,
932+ None => self . get_qnames ( name) ?,
933+ } ;
934+ for anno_key in qualified_keys {
935+ let anno_size = self
936+ . anno_key_sizes
937+ . get ( & anno_key)
938+ . copied ( )
939+ . unwrap_or_default ( ) ;
940+ if let Some ( histo) = self . histogram_bounds . get ( & anno_key) {
941+ if !histo. is_empty ( ) {
942+ let sampled_values = histo. iter ( ) . choose_multiple ( & mut rng, 20 ) ;
943+
944+ let matches = sampled_values
945+ . iter ( )
946+ . filter ( |v| pattern. is_match ( v) )
947+ . count ( ) ;
948+ if sampled_values. len ( ) == matches {
949+ // Assume all values match
950+ guessed_count += anno_size;
951+ } else if matches == 0 {
952+ // No match found, but use the bucket size as pessimistic guess
953+ guessed_count +=
954+ ( anno_size as f64 / sampled_values. len ( ) as f64 ) as usize ;
955+ } else {
956+ // Use the percent of matched values to guess the overall number
957+ let match_ratio =
958+ ( matches as f64 ) / ( sampled_values. len ( ) as f64 ) ;
959+ guessed_count += ( ( anno_size as f64 ) * match_ratio) as usize ;
960+ }
961+ }
962+ }
963+ }
964+ }
925965 }
926966
927967 Ok ( guessed_count. min ( total) )
0 commit comments