Skip to content

Commit 5181b74

Browse files
authored
Merge pull request #332 from korpling/fix-inverse-partof-estimate
Fix inverse PartOf estimate
2 parents 189a16a + 822054f commit 5181b74

8 files changed

Lines changed: 171 additions & 53 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1010
- `execute_query_on_graph` now skips nodes that are not part of the output
1111
(optional negated nodes) and makes sure the resulting iterator only produces
1212
unique results.
13+
- Queries with `@` could have extremly slow execution plans when the query
14+
planner introduces an inverted `@` operator and miscalculated the cost
15+
compared to the non-inverted version.
16+
- Frequency queries now execute the (additional) timeout check after a certain
17+
number of matches are processed, not if a specific tuple value has reached a
18+
treshold.
1319

1420
## [4.0.0] - 2025-08-20
1521

core/src/annostorage/inmemory.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ pub struct AnnoStorageImpl<T: Ord + Hash + Default> {
3333
anno_keys: SymbolTable<AnnoKey>,
3434
anno_values: SymbolTable<String>,
3535

36-
/// additional statistical information
36+
/// Sampled histograms for each annotation key .
37+
/// Each histogram bound defines a range of values where we estimate that they have the same number of occurences.
3738
histogram_bounds: BTreeMap<usize, Vec<String>>,
3839
largest_item: Option<T>,
3940
total_number_of_annos: usize,

core/src/annostorage/ondisk.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@ where
4848

4949
anno_key_sizes: BTreeMap<AnnoKey, usize>,
5050

51-
/// additional statistical information
51+
/// Sampled histograms for each annotation key .
52+
/// Each histogram bound defines a range of values where we estimate that they have the same number of occurences.
5253
histogram_bounds: BTreeMap<AnnoKey, Vec<String>>,
5354
largest_item: Option<T>,
5455

core/src/util/disk_collections.rs

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,6 @@ const BLOCK_MAX_SIZE: usize = 4 * KB;
2121
/// Uses a cache for each disk table with 8 MB capacity.
2222
pub const DEFAULT_BLOCK_CACHE_CAPACITY: usize = 8 * MB;
2323

24-
#[derive(Serialize, Deserialize)]
25-
struct Entry<K, V>
26-
where
27-
K: Ord,
28-
{
29-
key: K,
30-
value: V,
31-
}
32-
3324
pub enum EvictionStrategy {
3425
MaximumItems(usize),
3526
}

graphannis/src/annis/db/aql/operators/edge_op.rs

Lines changed: 63 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ struct BaseEdgeOpSpec {
2424
pub edge_anno: Option<EdgeAnnoSearchSpec>,
2525
pub is_reflexive: bool,
2626
pub op_str: Option<String>,
27-
pub check_cost_for_inverse_operator: bool,
27+
pub inverse_operator_needs_cost_check: bool,
2828
}
2929

3030
struct BaseEdgeOp {
@@ -45,34 +45,7 @@ impl BaseEdgeOp {
4545
gs.push(gs_for_component);
4646
}
4747

48-
let all_part_of_components = spec
49-
.components
50-
.iter()
51-
.all(|c| c.get_type() == AnnotationComponentType::PartOf);
52-
53-
let max_nodes_estimate = if all_part_of_components && gs.len() == 1 {
54-
// PartOf components have a very skewed distribution of root nodes
55-
// vs. the actual possible targets, thus do not use all nodes as
56-
// population but only the non-roots.
57-
if let Some(stats) = gs[0].get_statistics() {
58-
stats.nodes - stats.root_nodes
59-
} else {
60-
// Fallback to guessing by using the node type
61-
db.get_node_annos().guess_max_count(
62-
Some(&NODE_TYPE_KEY.ns),
63-
&NODE_TYPE_KEY.name,
64-
"corpus",
65-
"datasource",
66-
)?
67-
}
68-
} else {
69-
db.get_node_annos().guess_max_count(
70-
Some(&NODE_TYPE_KEY.ns),
71-
&NODE_TYPE_KEY.name,
72-
"node",
73-
"node",
74-
)?
75-
};
48+
let max_nodes_estimate = calculate_max_node_estimate(db, &spec, &gs, false)?;
7649
Ok(BaseEdgeOp {
7750
gs,
7851
spec,
@@ -82,6 +55,48 @@ impl BaseEdgeOp {
8255
}
8356
}
8457

58+
fn calculate_max_node_estimate(
59+
db: &AnnotationGraph,
60+
spec: &BaseEdgeOpSpec,
61+
gs: &[Arc<dyn GraphStorage>],
62+
inverse: bool,
63+
) -> Result<usize> {
64+
let all_components_are_partof = spec
65+
.components
66+
.iter()
67+
.all(|c| c.get_type() == AnnotationComponentType::PartOf);
68+
let max_nodes_estimate = if all_components_are_partof && gs.len() == 1 {
69+
// PartOf components have a very skewed distribution of root nodes vs.
70+
// the actual possible targets, thus do not use all nodes as population
71+
// but only the non-roots. We can only use this formula for the actual
72+
// @* operator, but not the inverted one.
73+
if !inverse && let Some(stats) = gs[0].get_statistics() {
74+
stats.nodes - stats.root_nodes
75+
} else {
76+
// Fallback to guessing how many nodes have the node type "corpus"
77+
// or "datasource" and thus could be reachable as RHS in a worst case
78+
// scenario. Since a node can't be part of itself, subtract 1 for
79+
// the node on the LHS.
80+
db.get_node_annos()
81+
.guess_max_count(
82+
Some(&NODE_TYPE_KEY.ns),
83+
&NODE_TYPE_KEY.name,
84+
"corpus",
85+
"datasource",
86+
)?
87+
.saturating_sub(1)
88+
}
89+
} else {
90+
db.get_node_annos().guess_max_count(
91+
Some(&NODE_TYPE_KEY.ns),
92+
&NODE_TYPE_KEY.name,
93+
"node",
94+
"node",
95+
)?
96+
};
97+
Ok(max_nodes_estimate)
98+
}
99+
85100
impl BinaryOperatorSpec for BaseEdgeOpSpec {
86101
fn necessary_components(
87102
&self,
@@ -286,13 +301,15 @@ impl BinaryOperatorBase for BaseEdgeOp {
286301

287302
fn get_inverse_operator<'a>(
288303
&self,
289-
_graph: &'a AnnotationGraph,
304+
graph: &'a AnnotationGraph,
290305
) -> Result<Option<BinaryOperator<'a>>> {
306+
let inverse = !self.inverse;
307+
291308
// Check if all graph storages have the same inverse cost. If not, we
292309
// don't provide an inverse operator, because the plans would not
293310
// account for the different costs
294311
for g in &self.gs {
295-
if self.spec.check_cost_for_inverse_operator && !g.inverse_has_same_cost() {
312+
if self.spec.inverse_operator_needs_cost_check && !g.inverse_has_same_cost() {
296313
return Ok(None);
297314
}
298315
if let Some(stat) = g.get_statistics() {
@@ -302,11 +319,12 @@ impl BinaryOperatorBase for BaseEdgeOp {
302319
}
303320
}
304321
}
322+
let max_nodes_estimate = calculate_max_node_estimate(graph, &self.spec, &self.gs, inverse)?;
305323
let edge_op = BaseEdgeOp {
306324
gs: self.gs.clone(),
307325
spec: self.spec.clone(),
308-
max_nodes_estimate: self.max_nodes_estimate,
309-
inverse: !self.inverse,
326+
max_nodes_estimate,
327+
inverse,
310328
};
311329
Ok(Some(BinaryOperator::Index(Box::new(edge_op))))
312330
}
@@ -317,7 +335,11 @@ impl BinaryOperatorBase for BaseEdgeOp {
317335
return Ok(EstimationType::Selectivity(0.0));
318336
}
319337

320-
let max_nodes: f64 = self.max_nodes_estimate as f64;
338+
let mut max_nodes: f64 = self.max_nodes_estimate as f64;
339+
// Avoid division by 0
340+
if max_nodes == 0.0 {
341+
max_nodes = 1.0;
342+
}
321343

322344
let mut worst_sel: f64 = 0.0;
323345

@@ -624,7 +646,7 @@ impl BinaryOperatorSpec for DominanceSpec {
624646
dist: self.dist.clone(),
625647
edge_anno: self.edge_anno.clone(),
626648
is_reflexive: true,
627-
check_cost_for_inverse_operator: true,
649+
inverse_operator_needs_cost_check: true,
628650
};
629651
base.create_operator(db, cost_estimate)
630652
}
@@ -676,7 +698,7 @@ impl BinaryOperatorSpec for PointingSpec {
676698
edge_anno: self.edge_anno.clone(),
677699
is_reflexive: true,
678700
op_str: Some(op_str),
679-
check_cost_for_inverse_operator: true,
701+
inverse_operator_needs_cost_check: true,
680702
};
681703
base.create_operator(db, cost_estimate)
682704
}
@@ -721,7 +743,7 @@ impl BinaryOperatorSpec for PartOfSubCorpusSpec {
721743
ANNIS_NS.into(),
722744
"".into(),
723745
)];
724-
let check_cost_for_inverse_operator = if let Some((_, rhs)) = cost_estimate {
746+
let inverse_operator_needs_cost_check = if let Some((_, rhs)) = cost_estimate {
725747
// Only ignore different cost and risk a nested loop join if the RHS
726748
// has an estimated output size of 1 and thus a nested loop is not
727749
// as costly.
@@ -735,7 +757,7 @@ impl BinaryOperatorSpec for PartOfSubCorpusSpec {
735757
dist: self.dist.clone(),
736758
edge_anno: None,
737759
is_reflexive: false,
738-
check_cost_for_inverse_operator,
760+
inverse_operator_needs_cost_check,
739761
};
740762

741763
base.create_operator(db, cost_estimate)
@@ -751,3 +773,6 @@ impl BinaryOperatorSpec for PartOfSubCorpusSpec {
751773
self
752774
}
753775
}
776+
777+
#[cfg(test)]
778+
mod tests;
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
use graphannis_core::graph::{
2+
ANNIS_NS,
3+
update::{GraphUpdate, UpdateEvent},
4+
};
5+
6+
use crate::{
7+
AnnotationGraph,
8+
annis::{
9+
db::{
10+
aql::{ast::RangeSpec, operators::PartOfSubCorpusSpec},
11+
exec::CostEstimate,
12+
},
13+
operator::{BinaryOperatorBase, BinaryOperatorSpec},
14+
},
15+
};
16+
17+
/// Tests that if you invert a @* operator, the cost estimate stays the same.
18+
#[test]
19+
fn inverted_partof_has_same_estimate() {
20+
// Create a simple annotation graph a chain of PartOf edges, so that the
21+
// fan-out and inverse fan-out of the PartOf component are both 1.
22+
// It has the following nodes, connected by a PartOf edge each
23+
// - root
24+
// - root/c
25+
// - root/c/c
26+
// - root/c/c/c
27+
// - ...
28+
// - root/c/c/c/c/c/c/c/c/c/c
29+
let mut update = GraphUpdate::new();
30+
update
31+
.add_event(UpdateEvent::AddNode {
32+
node_name: "root".to_string(),
33+
node_type: "corpus".to_string(),
34+
})
35+
.unwrap();
36+
for i in 1..10 {
37+
let source_path = format!("root{}", "/c".repeat(i));
38+
let target_path = format!("root{}", "/c".repeat(i - 1));
39+
update
40+
.add_event(UpdateEvent::AddNode {
41+
node_name: source_path.to_string(),
42+
node_type: "corpus".to_string(),
43+
})
44+
.unwrap();
45+
46+
update
47+
.add_event(UpdateEvent::AddEdge {
48+
source_node: source_path.to_string(),
49+
target_node: target_path.to_string(),
50+
layer: ANNIS_NS.to_string(),
51+
component_type: "PartOf".to_string(),
52+
component_name: "".to_string(),
53+
})
54+
.unwrap();
55+
}
56+
57+
let mut g = AnnotationGraph::with_default_graphstorages(false).unwrap();
58+
g.apply_update(&mut update, |_| {}).unwrap();
59+
60+
// Define an operator and a realistic cost estimate for LHS and RHS
61+
let spec = PartOfSubCorpusSpec {
62+
dist: RangeSpec::Unbound,
63+
};
64+
let cost_estimate_lhs = CostEstimate {
65+
output: 1,
66+
intermediate_sum: 0,
67+
processed_in_step: 0,
68+
};
69+
let cost_estimate_rhs = CostEstimate {
70+
output: 1,
71+
intermediate_sum: 0,
72+
processed_in_step: 0,
73+
};
74+
75+
let operator = spec
76+
.create_operator(&g, Some((&cost_estimate_lhs, &cost_estimate_rhs)))
77+
.unwrap();
78+
79+
let orig_estimate = operator.estimation_type().unwrap();
80+
81+
let inverted_operator = operator.get_inverse_operator(&g).unwrap();
82+
assert_eq!(true, inverted_operator.is_some());
83+
let inverted_operator = inverted_operator.unwrap();
84+
85+
let inverted_estimate = inverted_operator.estimation_type().unwrap();
86+
87+
assert_eq!(orig_estimate, inverted_estimate);
88+
}

graphannis/src/annis/db/corpusstorage.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ mod subgraph;
6969
#[cfg(test)]
7070
mod tests;
7171

72+
/// After how many produces tuples the timeout check should be manually triggered (in case the underlying joint did not already check the timeout)
73+
const TIMEOUT_CHECK_TUPLE_COUNT: u64 = 1_000;
74+
7275
enum CacheEntry {
7376
Loaded(AnnotationGraph),
7477
NotLoaded,
@@ -1602,7 +1605,7 @@ impl CorpusStorage {
16021605

16031606
for _ in plan {
16041607
total_count += 1;
1605-
if total_count % 1_000 == 0 {
1608+
if total_count.is_multiple_of(TIMEOUT_CHECK_TUPLE_COUNT) {
16061609
timeout.check()?;
16071610
}
16081611
}
@@ -1666,7 +1669,7 @@ impl CorpusStorage {
16661669
}
16671670
match_count += 1;
16681671

1669-
if match_count % 1_000 == 0 {
1672+
if match_count.is_multiple_of(TIMEOUT_CHECK_TUPLE_COUNT) {
16701673
timeout.check()?;
16711674
}
16721675
}
@@ -2307,9 +2310,11 @@ impl CorpusStorage {
23072310

23082311
let plan =
23092312
ExecutionPlan::from_disjunction(&prep.query, db, &self.query_config, timeout)?;
2313+
let mut total_count: u64 = 0;
23102314

23112315
for mgroup in plan {
23122316
let mgroup = mgroup?;
2317+
23132318
// for each match, extract the defined annotation (by its key) from the result node
23142319
let mut tuple: Vec<String> = Vec::with_capacity(annokeys.len());
23152320
for (node_ref, anno_keys) in &annokeys {
@@ -2328,7 +2333,8 @@ impl CorpusStorage {
23282333
let tuple_count: &mut usize = tuple_frequency.entry(tuple).or_insert(0);
23292334
*tuple_count += 1;
23302335

2331-
if *tuple_count % 1_000 == 0 {
2336+
total_count += 1;
2337+
if total_count.is_multiple_of(TIMEOUT_CHECK_TUPLE_COUNT) {
23322338
timeout.check()?;
23332339
}
23342340
}

graphannis/src/annis/operator.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ impl EdgeAnnoSearchSpec {
118118
}
119119

120120
/// Represents the different strategies to estimate the output of size of applying an operator.
121-
#[derive(Clone)]
121+
#[derive(Clone, Debug, PartialEq)]
122122
pub enum EstimationType {
123123
/// Estimate using the given selectivity.
124124
/// This means the cross product of the input sizes is multiplied with this factor to get the output size.

0 commit comments

Comments
 (0)