Skip to content

Commit 2f353b5

Browse files
committed
Estimate that only 5% of nodes match a regex without a prefix
1 parent 540a817 commit 2f353b5

3 files changed

Lines changed: 24 additions & 27 deletions

File tree

core/src/annostorage/inmemory.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -720,7 +720,6 @@ where
720720
if let Some(anno_key) = self.anno_keys.get_symbol(&anno_key) {
721721
if let Some(histo) = self.histogram_bounds.get(&anno_key) {
722722
// find the range in which the value is contained
723-
724723
// we need to make sure the histogram is not empty -> should have at least two bounds
725724
if histo.len() >= 2 {
726725
sum_histogram_buckets += histo.len() - 1;
@@ -775,7 +774,10 @@ where
775774
}
776775
}
777776
} else {
778-
guessed_count = total;
777+
// For regular expressions without a prefix the worst case would be `.*[X].*` where `[X]` are the most common characters.
778+
// Assume that a generic percentage (here 5%) of all nodes match the regex.
779+
// TODO: find better ways of estimating this constant
780+
guessed_count = (0.05 * (total as f64)) as usize;
779781
}
780782

781783
Ok(guessed_count.min(total))

core/src/annostorage/ondisk.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -918,7 +918,10 @@ where
918918
}
919919
}
920920
} else {
921-
guessed_count = total;
921+
// For regular expressions without a prefix the worst case would be `.*[X].*` where `[X]` are the most common characters.
922+
// Assume that a generic percentage (here 5%) of all nodes match the regex.
923+
// TODO: find better ways of estimating this constant
924+
guessed_count = (0.05 * (total as f64)) as usize;
922925
}
923926

924927
Ok(guessed_count.min(total))

graphannis/src/annis/db/aql/operators/edge_op.rs

Lines changed: 16 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -44,38 +44,30 @@ impl BaseEdgeOp {
4444

4545
gs.push(gs_for_component);
4646
}
47-
let any_node_count = db.get_node_annos().guess_max_count(
48-
Some(&NODE_TYPE_KEY.ns),
49-
&NODE_TYPE_KEY.name,
50-
"",
51-
&char::MAX.to_string(),
52-
)?;
47+
5348
let all_part_of_components = spec
5449
.components
5550
.iter()
5651
.all(|c| c.get_type() == AnnotationComponentType::PartOf);
5752
let max_nodes_estimate = if all_part_of_components {
5853
// PartOf components have a very skewed distribution of root nodes
5954
// vs. the actual possible targets, thus do not use all nodes as
60-
// population but only the non-roots.
61-
if gs.len() == 1 {
62-
gs[0]
63-
.get_statistics()
64-
.map(|s| s.nodes.saturating_sub(s.root_nodes))
65-
.unwrap_or(any_node_count)
66-
} else {
67-
// If multiple PartOf graph storages are combined, we can guess
68-
// the non-root nodes by estimating the number of nodes in the
69-
// corpus grah.
70-
db.get_node_annos().guess_max_count(
71-
Some(&NODE_TYPE_KEY.ns),
72-
&NODE_TYPE_KEY.name,
73-
"corpus",
74-
"text",
75-
)?
76-
}
55+
// population but only the non-roots. We can guess the non-root
56+
// nodes by estimating the number of nodes in the corpus graph.
57+
let result = db.get_node_annos().guess_max_count(
58+
Some(&NODE_TYPE_KEY.ns),
59+
&NODE_TYPE_KEY.name,
60+
"corpus",
61+
"datasource",
62+
)?;
63+
result
7764
} else {
78-
any_node_count
65+
db.get_node_annos().guess_max_count(
66+
Some(&NODE_TYPE_KEY.ns),
67+
&NODE_TYPE_KEY.name,
68+
"node",
69+
"node",
70+
)?
7971
};
8072
Ok(BaseEdgeOp {
8173
gs,

0 commit comments

Comments
 (0)