Skip to content

Commit 8dfc64f

Browse files
authored
Merge pull request #325 from korpling/better-regex-estimation
Better estimation for queries with regular expressions without prefix
2 parents afe7bde + 97f9614 commit 8dfc64f

8 files changed

Lines changed: 261 additions & 84 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1010
- Avoid loading the node annotation storage when listing the components for a
1111
corpus in the `CorpusStorage`. Before this change, querying for components via
1212
the webservice could block the corpus cache.
13+
- Better estimation for queries with regular expressions without prefix.
14+
1315

1416
## [3.8.0] - 2025-05-14
1517

core/src/annostorage/inmemory.rs

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ use crate::util::{self};
77
use crate::{annostorage::symboltable::SymbolTable, errors::GraphAnnisCoreError};
88
use core::ops::Bound::*;
99
use itertools::Itertools;
10+
use rand::seq::IteratorRandom;
11+
use rand::thread_rng;
1012
use rustc_hash::FxHashSet;
1113
use smartstring::alias::String;
1214
use smartstring::{LazyCompact, SmartString};
@@ -775,9 +777,50 @@ where
775777
}
776778
} else {
777779
// For regular expressions without a prefix the worst case would be `.*[X].*` where `[X]` are the most common characters.
778-
// Assume that a generic percentage (here 5%) of all nodes match the regex.
779-
// TODO: find better ways of estimating this constant
780-
guessed_count = (0.05 * (total as f64)) as usize;
780+
// Sample values from the histogram to better estimate what percentage of the actual values could match.
781+
if let Ok(pattern) = regex::Regex::new(&full_match_pattern) {
782+
let mut rng = thread_rng();
783+
let qualified_keys: Vec<_> = match ns {
784+
Some(ns) => vec![AnnoKey {
785+
name: name.into(),
786+
ns: ns.into(),
787+
}],
788+
None => self.get_qnames(name)?,
789+
}
790+
.into_iter()
791+
.filter_map(|key| self.anno_keys.get_symbol(&key).map(|symbol| (symbol, key)))
792+
.collect();
793+
for (anno_key_symbol, anno_key) in qualified_keys {
794+
let anno_size = self
795+
.anno_key_sizes
796+
.get(&anno_key)
797+
.copied()
798+
.unwrap_or_default();
799+
800+
if let Some(histo) = self.histogram_bounds.get(&anno_key_symbol) {
801+
if !histo.is_empty() {
802+
let sampled_values = histo.iter().choose_multiple(&mut rng, 20);
803+
let matches = sampled_values
804+
.iter()
805+
.filter(|v| pattern.is_match(v))
806+
.count();
807+
if sampled_values.len() == matches {
808+
// Assume all values match
809+
guessed_count += anno_size;
810+
} else if matches == 0 {
811+
// No match found, but use the bucket size as a pessimistic guess
812+
guessed_count +=
813+
(anno_size as f64 / sampled_values.len() as f64) as usize;
814+
} else {
815+
// Use the percent of matched values to guess the overall number
816+
let match_ratio =
817+
(matches as f64) / (sampled_values.len() as f64);
818+
guessed_count += ((anno_size as f64) * match_ratio) as usize;
819+
}
820+
}
821+
}
822+
}
823+
}
781824
}
782825

783826
Ok(guessed_count.min(total))

core/src/annostorage/inmemory/tests.rs

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -171,10 +171,14 @@ fn get_node_id_from_name() {
171171
assert_eq!(false, a.has_node_name("somenode").unwrap());
172172
}
173173

174-
#[test]
175-
fn regex_search() {
174+
/// Inserts the following strings as node names:
175+
/// - _ABC
176+
/// - AAA
177+
/// - AAB
178+
/// - AAC
179+
/// - B
180+
fn insert_test_strings(a: &mut AnnoStorageImpl<NodeID>) {
176181
let key = NODE_NAME_KEY.as_ref().clone();
177-
let mut a: AnnoStorageImpl<NodeID> = AnnoStorageImpl::new();
178182
a.insert(
179183
0,
180184
Annotation {
@@ -216,6 +220,12 @@ fn regex_search() {
216220
},
217221
)
218222
.unwrap();
223+
}
224+
225+
#[test]
226+
fn regex_search() {
227+
let mut a: AnnoStorageImpl<NodeID> = AnnoStorageImpl::new();
228+
insert_test_strings(&mut a);
219229

220230
let result: Result<Vec<_>> = a
221231
.regex_anno_search(Some(ANNIS_NS), NODE_NAME, "A.*", false)
@@ -261,3 +271,20 @@ fn regex_search() {
261271

262272
assert_eq!(0, result.len());
263273
}
274+
275+
#[test]
276+
fn regex_estimation_without_prefix() {
277+
let mut a: AnnoStorageImpl<NodeID> = AnnoStorageImpl::new();
278+
insert_test_strings(&mut a);
279+
a.calculate_statistics().unwrap();
280+
281+
// Test with namespace
282+
// Since there are far fewer than 250 items, the histogram-based statistics should be exact.
283+
assert_eq!(
284+
3,
285+
a.guess_max_count_regex(Some(ANNIS_NS), NODE_NAME, ".A.")
286+
.unwrap()
287+
);
288+
// Test without namespace
289+
assert_eq!(3, a.guess_max_count_regex(None, NODE_NAME, ".A.").unwrap());
290+
}

core/src/annostorage/ondisk.rs

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use crate::{try_as_boxed_iter, util};
1010
use core::ops::Bound::*;
1111
use itertools::Itertools;
1212
use rand::seq::IteratorRandom;
13+
use rand::thread_rng;
1314
use regex_syntax::hir::literal::Seq;
1415
use regex_syntax::Parser;
1516
use serde_bytes::ByteBuf;
@@ -886,7 +887,8 @@ where
886887

887888
if sum_histogram_buckets > 0 {
888889
let selectivity: f64 = (count_matches as f64) / (sum_histogram_buckets as f64);
889-
Ok((selectivity * (universe_size as f64)).round() as usize)
890+
let estimation = (selectivity * (universe_size as f64)).round() as usize;
891+
Ok(estimation)
890892
} else {
891893
Ok(0)
892894
}
@@ -919,9 +921,47 @@ where
919921
}
920922
} else {
921923
// For regular expressions without a prefix the worst case would be `.*[X].*` where `[X]` are the most common characters.
922-
// Assume that a generic percentage (here 5%) of all nodes match the regex.
923-
// TODO: find better ways of estimating this constant
924-
guessed_count = (0.05 * (total as f64)) as usize;
924+
// Sample values from the histogram to better estimate what percentage of the actual values could match.
925+
if let Ok(pattern) = regex::Regex::new(&full_match_pattern) {
926+
let mut rng = thread_rng();
927+
let qualified_keys = match ns {
928+
Some(ns) => vec![AnnoKey {
929+
name: name.into(),
930+
ns: ns.into(),
931+
}],
932+
None => self.get_qnames(name)?,
933+
};
934+
for anno_key in qualified_keys {
935+
let anno_size = self
936+
.anno_key_sizes
937+
.get(&anno_key)
938+
.copied()
939+
.unwrap_or_default();
940+
if let Some(histo) = self.histogram_bounds.get(&anno_key) {
941+
if !histo.is_empty() {
942+
let sampled_values = histo.iter().choose_multiple(&mut rng, 20);
943+
944+
let matches = sampled_values
945+
.iter()
946+
.filter(|v| pattern.is_match(v))
947+
.count();
948+
if sampled_values.len() == matches {
949+
// Assume all values match
950+
guessed_count += anno_size;
951+
} else if matches == 0 {
952+
// No match found, but use the bucket size as a pessimistic guess
953+
guessed_count +=
954+
(anno_size as f64 / sampled_values.len() as f64) as usize;
955+
} else {
956+
// Use the percent of matched values to guess the overall number
957+
let match_ratio =
958+
(matches as f64) / (sampled_values.len() as f64);
959+
guessed_count += ((anno_size as f64) * match_ratio) as usize;
960+
}
961+
}
962+
}
963+
}
964+
}
925965
}
926966

927967
Ok(guessed_count.min(total))

core/src/annostorage/ondisk/tests.rs

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -180,10 +180,14 @@ fn get_node_id_from_name() {
180180
assert_eq!(false, a.has_node_name("somenode").unwrap());
181181
}
182182

183-
#[test]
184-
fn regex_search() {
183+
/// Inserts the following strings as node names:
184+
/// - _ABC
185+
/// - AAA
186+
/// - AAB
187+
/// - AAC
188+
/// - B
189+
fn insert_test_strings(a: &mut AnnoStorageImpl<NodeID>) {
185190
let key = NODE_NAME_KEY.as_ref().clone();
186-
let mut a: AnnoStorageImpl<NodeID> = AnnoStorageImpl::new(None).unwrap();
187191
a.insert(
188192
0,
189193
Annotation {
@@ -225,6 +229,12 @@ fn regex_search() {
225229
},
226230
)
227231
.unwrap();
232+
}
233+
234+
#[test]
235+
fn regex_search() {
236+
let mut a: AnnoStorageImpl<NodeID> = AnnoStorageImpl::new(None).unwrap();
237+
insert_test_strings(&mut a);
228238

229239
// Test with namespace
230240
let result: Result<Vec<_>> = a
@@ -270,7 +280,7 @@ fn regex_search() {
270280
let result = result.unwrap();
271281
assert_eq!(0, result.len());
272282

273-
// lso test without namepsace
283+
// also test without namespace
274284
let result: Result<Vec<_>> = a
275285
.regex_anno_search(None, NODE_NAME, "A.*", false)
276286
.map_ok(|m| m.node)
@@ -297,3 +307,20 @@ fn regex_search() {
297307
result.sort();
298308
assert_eq!(vec![0, 4], result);
299309
}
310+
311+
#[test]
312+
fn regex_estimation_without_prefix() {
313+
let mut a: AnnoStorageImpl<NodeID> = AnnoStorageImpl::new(None).unwrap();
314+
insert_test_strings(&mut a);
315+
a.calculate_statistics().unwrap();
316+
317+
// Test with namespace
318+
// Since there are far fewer than 250 items, the histogram-based statistics should be exact.
319+
assert_eq!(
320+
3,
321+
a.guess_max_count_regex(Some(ANNIS_NS), NODE_NAME, ".A.")
322+
.unwrap()
323+
);
324+
// Test without namespace
325+
assert_eq!(3, a.guess_max_count_regex(None, NODE_NAME, ".A.").unwrap());
326+
}

0 commit comments

Comments
 (0)