Skip to content

Commit 8173d43

Browse files
authored
Merge pull request #322 from korpling/fix-partof-estimation
Improve selectivity estimation for @-Queries
2 parents e31cfc2 + 7966e8d commit 8173d43

96 files changed

Lines changed: 574 additions & 89 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitattributes

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1-
*.graphml text eol=lf
1+
*.graphml text eol=lf
2+
third-party-licenses.html linguist-generated
3+
/graphannis/tests/data/ linguist-generated=true

.github/workflows/verify.yml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
name: Verify
22

33
on:
4-
push:
5-
branches:
6-
- main
74
pull_request:
85

96
jobs:
@@ -66,4 +63,3 @@ jobs:
6663
```
6764
${{env.COVERAGE_INFO}}
6865
```
69-

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1616
- New optional `file` option for the `[logging]` section in the webservice
1717
configuration. Can be used to additionally output all log messages to the given
1818
file.
19+
- Add number of root nodes to graph storage statistics. This changes the way
20+
most of the graph storages store their statistics. You can use old imported data
21+
files, but to make use of the new information you queries, you have to
22+
**reimport** your corpora.
1923
- `Graph:ensure_loaded_parallel` returns the actually loaded components that did
2024
exist.
2125

2226
### Fixed
2327

2428
- Less frequent corpus cache status updates in log. Before, every corpus access
2529
could trigger an entry into the log which is not desired under heavy load.
30+
- Improve query execution planning by assuming all annotations can be matched in
31+
regular expressions without a prefix.
2632

2733
## [3.7.1] - 2025-04-14
2834

cli/src/bin/annis.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,7 @@ impl AnnisRunner {
321321
format = ExportFormat::GraphMLZip;
322322
} else if file_ext.to_string_lossy() == ".graphml" && self.current_corpus.len() != 1 {
323323
bail!(
324-
r##"You need to select a *single* corpus first with the \"corpus\" command when exporting to a GraphML file.
324+
r##"You need to select a *single* corpus first with the \"corpus\" command when exporting to a GraphML file.
325325
To export multiple corpora, select a directory as output or a ZIP file (ending with .zip)"##
326326
);
327327
}
@@ -442,7 +442,7 @@ impl AnnisRunner {
442442
"unsorted" => ResultOrder::NotSorted,
443443
_ => {
444444
return Err(anyhow!(
445-
"Non-existing order with name {}.
445+
"Non-existing order with name {}.
446446
Must be one of \"normal\", \"inverted\", \"random\", \"unsorted\"",
447447
args
448448
));

cli/tests/cli.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ fn show_corpus_info() -> Result<(), Box<dyn std::error::Error>> {
2626

2727
cmd.arg("../graphannis/tests/data/")
2828
.arg("-c")
29-
.arg("corpus sample-disk-based-3.3")
29+
.arg("corpus sample-disk-based-3.8")
3030
.arg("-c")
3131
.arg("preload")
3232
.arg("-c")

cli/tests/snapshots/cli__list_corpora_fully_loaded.snap

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ exit_code: 0
2020
sample-disk-based-1.5 (not loaded)
2121
sample-disk-based-3.2 (not loaded)
2222
sample-disk-based-3.3 (fully loaded)
23+
sample-disk-based-3.8 (not loaded)
2324
sample-memory-based-1.5 (not loaded)
2425
sample-memory-based-3.2 (not loaded)
2526
sample-memory-based-3.3 (not loaded)

cli/tests/snapshots/cli__list_corpora_not_loaded.snap

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,10 @@ exit_code: 0
1313
sample-disk-based-1.5 (not loaded)
1414
sample-disk-based-3.2 (not loaded)
1515
sample-disk-based-3.3 (not loaded)
16+
sample-disk-based-3.8 (not loaded)
1617
sample-memory-based-1.5 (not loaded)
1718
sample-memory-based-3.2 (not loaded)
1819
sample-memory-based-3.3 (not loaded)
1920
graphANNIS says good-bye!
2021

2122
----- stderr -----
22-

cli/tests/snapshots/cli__list_corpora_partially_loaded.snap

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ result: 44 matches in 4 documents
2121
sample-disk-based-1.5 (not loaded)
2222
sample-disk-based-3.2 (not loaded)
2323
sample-disk-based-3.3 (partially loaded)
24+
sample-disk-based-3.8 (not loaded)
2425
sample-memory-based-1.5 (not loaded)
2526
sample-memory-based-3.2 (not loaded)
2627
sample-memory-based-3.3 (not loaded)

cli/tests/snapshots/cli__show_corpus_info.snap

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ info:
55
args:
66
- "../graphannis/tests/data/"
77
- "-c"
8-
- corpus sample-disk-based-3.3
8+
- corpus sample-disk-based-3.8
99
- "-c"
1010
- preload
1111
- "-c"
@@ -14,59 +14,59 @@ info:
1414
success: true
1515
exit_code: 0
1616
----- stdout -----
17-
12:00:00[INFO] Loaded corpus sample-disk-based-3.3
18-
12:00:00[INFO] Corpus cache after preloading sample-disk-based-3.3: 100MB / 300MB - loaded corpora [sample-disk-based-3.3]
17+
12:00:00[INFO] Loaded corpus sample-disk-based-3.8
18+
12:00:00[INFO] Corpus cache after preloading sample-disk-based-3.8: 100MB / 300MB - loaded corpora [sample-disk-based-3.8]
1919
12:00:00[INFO] Preloaded corpus in 10 ms
2020
Status: "fully loaded"
2121
Token search shortcut possible: true
2222
------------
2323
Component Coverage//: 0 annnotations
24-
Stats: nodes=92, avg_fan_out=2.17, max_fan_out=11, max_depth=1
24+
Stats: nodes=92, root nodes=48, avg_fan_out=2.17, max_fan_out=11, fan_out_99%=11, inv_fan_out_99%=9, max_depth=1
2525
Implementation: DiskAdjacencyListV1
2626
Status: "fully loaded"
2727
------------
2828
Component Coverage/annis/: 0 annnotations
29-
Stats: nodes=0, avg_fan_out=0.00, max_fan_out=0, max_depth=1, tree
29+
Stats: nodes=0, root nodes=0, avg_fan_out=0.00, max_fan_out=0, fan_out_99%=0, inv_fan_out_99%=0, max_depth=1, tree
3030
Implementation: DiskAdjacencyListV1
3131
Status: "fully loaded"
3232
------------
3333
Component Coverage/default_ns/: 0 annnotations
34-
Stats: nodes=56, avg_fan_out=0.93, max_fan_out=10, max_depth=1
34+
Stats: nodes=56, root nodes=12, avg_fan_out=0.93, max_fan_out=10, fan_out_99%=10, inv_fan_out_99%=2, max_depth=1
3535
Implementation: DiskAdjacencyListV1
3636
Status: "fully loaded"
3737
------------
3838
Component Coverage/annis/inherited-coverage: 0 annnotations
39-
Stats: nodes=0, avg_fan_out=0.00, max_fan_out=0, max_depth=1, tree
39+
Stats: nodes=0, root nodes=0, avg_fan_out=0.00, max_fan_out=0, fan_out_99%=0, inv_fan_out_99%=0, max_depth=1, tree
4040
Implementation: DiskAdjacencyListV1
4141
Status: "fully loaded"
4242
------------
4343
Component Dominance/syntax/: 0 annnotations
44-
Stats: nodes=92, avg_fan_out=0.96, max_fan_out=3, max_depth=9, tree
44+
Stats: nodes=92, root nodes=4, avg_fan_out=0.96, max_fan_out=3, fan_out_99%=3, inv_fan_out_99%=1, max_depth=9, tree
4545
Implementation: PrePostOrderO16L8V1
4646
Status: "fully loaded"
4747
------------
4848
Component Pointing/default_ns/anaphoric: 0 annnotations
49-
Stats: nodes=8, avg_fan_out=0.50, max_fan_out=1, max_depth=1, tree
49+
Stats: nodes=8, root nodes=4, avg_fan_out=0.50, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=1, max_depth=1, tree
5050
Implementation: DiskPathV1_D15
5151
Status: "fully loaded"
5252
------------
5353
Component Ordering/annis/: 0 annnotations
54-
Stats: nodes=44, avg_fan_out=0.91, max_fan_out=1, max_depth=10, tree
54+
Stats: nodes=44, root nodes=4, avg_fan_out=0.91, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=1, max_depth=10, tree
5555
Implementation: DiskPathV1_D15
5656
Status: "fully loaded"
5757
------------
5858
Component LeftToken/annis/: 0 annnotations
59-
Stats: nodes=92, avg_fan_out=0.65, max_fan_out=1, max_depth=1
59+
Stats: nodes=92, root nodes=60, avg_fan_out=0.65, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=3, max_depth=1
6060
Implementation: DiskPathV1_D15
6161
Status: "fully loaded"
6262
------------
6363
Component RightToken/annis/: 0 annnotations
64-
Stats: nodes=84, avg_fan_out=0.71, max_fan_out=1, max_depth=1
64+
Stats: nodes=84, root nodes=60, avg_fan_out=0.71, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=8, max_depth=1
6565
Implementation: DiskPathV1_D15
6666
Status: "fully loaded"
6767
------------
6868
Component PartOf/annis/: 0 annnotations
69-
Stats: nodes=115, avg_fan_out=0.99, max_fan_out=1, max_depth=4
69+
Stats: nodes=115, root nodes=104, avg_fan_out=0.99, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=26, max_depth=4
7070
Implementation: DiskPathV1_D15
7171
Status: "fully loaded"
7272
------------

core/src/annostorage/inmemory.rs

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -720,7 +720,6 @@ where
720720
if let Some(anno_key) = self.anno_keys.get_symbol(&anno_key) {
721721
if let Some(histo) = self.histogram_bounds.get(&anno_key) {
722722
// find the range in which the value is contained
723-
724723
// we need to make sure the histogram is not empty -> should have at least two bounds
725724
if histo.len() >= 2 {
726725
sum_histogram_buckets += histo.len() - 1;
@@ -752,6 +751,10 @@ where
752751
fn guess_max_count_regex(&self, ns: Option<&str>, name: &str, pattern: &str) -> Result<usize> {
753752
let full_match_pattern = util::regex_full_match(pattern);
754753

754+
// Get the total number of annotations with the namespace/name. We
755+
// can't get larger than this number
756+
let total = self.number_of_annotations_by_name(ns, name)?;
757+
755758
// Try to parse the regular expression
756759
let parsed = regex_syntax::Parser::new().parse(&full_match_pattern);
757760
if let Ok(parsed) = parsed {
@@ -770,11 +773,13 @@ where
770773
guessed_count += self.guess_max_count(ns, name, lower_val, &upper_val)?;
771774
}
772775
}
776+
} else {
777+
// For regular expressions without a prefix the worst case would be `.*[X].*` where `[X]` are the most common characters.
778+
// Assume that a generic percentage (here 5%) of all nodes match the regex.
779+
// TODO: find better ways of estimating this constant
780+
guessed_count = (0.05 * (total as f64)) as usize;
773781
}
774782

775-
// Get the total number of annotations with the namespace/name. We
776-
// can't get larger than this number
777-
let total = self.number_of_annotations_by_name(ns, name)?;
778783
Ok(guessed_count.min(total))
779784
} else {
780785
Ok(0)

0 commit comments

Comments
 (0)