Skip to content

Commit 97f9614

Browse files
committed
Fix issue that not the full match is used to estimate the non-prefix regex matches
1 parent 28b4763 commit 97f9614

4 files changed

Lines changed: 66 additions & 10 deletions

File tree

core/src/annostorage/inmemory.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -778,7 +778,7 @@ where
778778
} else {
779779
// For regular expressions without a prefix the worst case would be `.*[X].*` where `[X]` are the most common characters.
780780
// Sample values from the histogram to get a better estimation of how many percent of the actual values could match.
781-
if let Ok(pattern) = regex::Regex::new(pattern) {
781+
if let Ok(pattern) = regex::Regex::new(&full_match_pattern) {
782782
let mut rng = thread_rng();
783783
let qualified_keys: Vec<_> = match ns {
784784
Some(ns) => vec![AnnoKey {

core/src/annostorage/inmemory/tests.rs

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -171,10 +171,14 @@ fn get_node_id_from_name() {
171171
assert_eq!(false, a.has_node_name("somenode").unwrap());
172172
}
173173

174-
#[test]
175-
fn regex_search() {
174+
/// Inserts the following strings as node names:
175+
/// - _ABC
176+
/// - AAA
177+
/// - AAB
178+
/// - AAC
179+
/// - B
180+
fn insert_test_strings(a: &mut AnnoStorageImpl<NodeID>) {
176181
let key = NODE_NAME_KEY.as_ref().clone();
177-
let mut a: AnnoStorageImpl<NodeID> = AnnoStorageImpl::new();
178182
a.insert(
179183
0,
180184
Annotation {
@@ -216,6 +220,12 @@ fn regex_search() {
216220
},
217221
)
218222
.unwrap();
223+
}
224+
225+
#[test]
226+
fn regex_search() {
227+
let mut a: AnnoStorageImpl<NodeID> = AnnoStorageImpl::new();
228+
insert_test_strings(&mut a);
219229

220230
let result: Result<Vec<_>> = a
221231
.regex_anno_search(Some(ANNIS_NS), NODE_NAME, "A.*", false)
@@ -261,3 +271,20 @@ fn regex_search() {
261271

262272
assert_eq!(0, result.len());
263273
}
274+
275+
#[test]
276+
fn regex_estimation_without_prefix() {
277+
let mut a: AnnoStorageImpl<NodeID> = AnnoStorageImpl::new();
278+
insert_test_strings(&mut a);
279+
a.calculate_statistics().unwrap();
280+
281+
// Test with namespace
282+
// Since there are very less than 250 items the histogram based statistics should be exact.
283+
assert_eq!(
284+
3,
285+
a.guess_max_count_regex(Some(ANNIS_NS), NODE_NAME, ".A.")
286+
.unwrap()
287+
);
288+
// Test without namespace
289+
assert_eq!(3, a.guess_max_count_regex(None, NODE_NAME, ".A.").unwrap());
290+
}

core/src/annostorage/ondisk.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -887,7 +887,8 @@ where
887887

888888
if sum_histogram_buckets > 0 {
889889
let selectivity: f64 = (count_matches as f64) / (sum_histogram_buckets as f64);
890-
Ok((selectivity * (universe_size as f64)).round() as usize)
890+
let estimation = (selectivity * (universe_size as f64)).round() as usize;
891+
Ok(estimation)
891892
} else {
892893
Ok(0)
893894
}
@@ -921,7 +922,7 @@ where
921922
} else {
922923
// For regular expressions without a prefix the worst case would be `.*[X].*` where `[X]` are the most common characters.
923924
// Sample values from the histogram to get a better estimation of how many percent of the actual values could match.
924-
if let Ok(pattern) = regex::Regex::new(pattern) {
925+
if let Ok(pattern) = regex::Regex::new(&full_match_pattern) {
925926
let mut rng = thread_rng();
926927
let qualified_keys = match ns {
927928
Some(ns) => vec![AnnoKey {
@@ -939,6 +940,7 @@ where
939940
if let Some(histo) = self.histogram_bounds.get(&anno_key) {
940941
if !histo.is_empty() {
941942
let sampled_values = histo.iter().choose_multiple(&mut rng, 20);
943+
942944
let matches = sampled_values
943945
.iter()
944946
.filter(|v| pattern.is_match(v))

core/src/annostorage/ondisk/tests.rs

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -180,10 +180,14 @@ fn get_node_id_from_name() {
180180
assert_eq!(false, a.has_node_name("somenode").unwrap());
181181
}
182182

183-
#[test]
184-
fn regex_search() {
183+
/// Inserts the following strings as node names:
184+
/// - _ABC
185+
/// - AAA
186+
/// - AAB
187+
/// - AAC
188+
/// - B
189+
fn insert_test_strings(a: &mut AnnoStorageImpl<NodeID>) {
185190
let key = NODE_NAME_KEY.as_ref().clone();
186-
let mut a: AnnoStorageImpl<NodeID> = AnnoStorageImpl::new(None).unwrap();
187191
a.insert(
188192
0,
189193
Annotation {
@@ -225,6 +229,12 @@ fn regex_search() {
225229
},
226230
)
227231
.unwrap();
232+
}
233+
234+
#[test]
235+
fn regex_search() {
236+
let mut a: AnnoStorageImpl<NodeID> = AnnoStorageImpl::new(None).unwrap();
237+
insert_test_strings(&mut a);
228238

229239
// Test with namespace
230240
let result: Result<Vec<_>> = a
@@ -270,7 +280,7 @@ fn regex_search() {
270280
let result = result.unwrap();
271281
assert_eq!(0, result.len());
272282

273-
// lso test without namepsace
283+
// also test without namepsace
274284
let result: Result<Vec<_>> = a
275285
.regex_anno_search(None, NODE_NAME, "A.*", false)
276286
.map_ok(|m| m.node)
@@ -297,3 +307,20 @@ fn regex_search() {
297307
result.sort();
298308
assert_eq!(vec![0, 4], result);
299309
}
310+
311+
#[test]
312+
fn regex_estimation_without_prefix() {
313+
let mut a: AnnoStorageImpl<NodeID> = AnnoStorageImpl::new(None).unwrap();
314+
insert_test_strings(&mut a);
315+
a.calculate_statistics().unwrap();
316+
317+
// Test with namespace
318+
// Since there are very less than 250 items the histogram based statistics should be exact.
319+
assert_eq!(
320+
3,
321+
a.guess_max_count_regex(Some(ANNIS_NS), NODE_NAME, ".A.")
322+
.unwrap()
323+
);
324+
// Test without namespace
325+
assert_eq!(3, a.guess_max_count_regex(None, NODE_NAME, ".A.").unwrap());
326+
}

0 commit comments

Comments
 (0)