Skip to content

Commit ca637fb

Browse files
committed
Avoid loading the node annotation storage when listing the components
1 parent 8474176 commit ca637fb

3 files changed

Lines changed: 106 additions & 64 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
55

66
## [Unreleased]
77

8+
### Fixed
9+
10+
- Avoid loading the node annotation storage when listing the components for a
11+
corpus in the `CorpusStorage`. Before this change, querying for components via
12+
the webservice could block the corpus cache.
13+
814
## [3.8.0] - 2025-05-14
915

1016
### Changed

core/src/graph/mod.rs

Lines changed: 65 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ use crate::{
1414
use clru::CLruCache;
1515
use rayon::prelude::*;
1616
use smartstring::alias::String as SmartString;
17-
use std::io::prelude::*;
1817
use std::ops::Bound::Included;
1918
use std::path::{Path, PathBuf};
2019
use std::string::ToString;
@@ -23,6 +22,7 @@ use std::{
2322
sync::{Arc, Mutex},
2423
};
2524
use std::{collections::BTreeMap, num::NonZeroUsize};
25+
use std::{collections::BTreeSet, io::prelude::*};
2626
use update::{GraphUpdate, UpdateEvent};
2727

2828
pub const ANNIS_NS: &str = "annis";
@@ -113,6 +113,66 @@ fn component_path<CT: ComponentType>(
113113
}
114114
}
115115

116+
/// List all the components that belong to the corpus in the given directory.
117+
pub fn find_components_from_disk<CT: ComponentType, P: AsRef<Path>>(
118+
location: P,
119+
) -> Result<BTreeSet<Component<CT>>> {
120+
let mut result = BTreeSet::new();
121+
// for all component types
122+
for c in CT::all_component_types().into_iter() {
123+
let cpath = PathBuf::from(location.as_ref())
124+
.join("gs")
125+
.join(c.to_string());
126+
127+
if cpath.is_dir() {
128+
// get all the namespaces/layers
129+
for layer in cpath.read_dir()? {
130+
let layer = layer?;
131+
if layer.path().is_dir() {
132+
// try to load the component with the empty name
133+
let layer_file_name = layer.file_name();
134+
let layer_name_from_file = layer_file_name.to_string_lossy();
135+
let layer_name: SmartString = if layer_name_from_file == DEFAULT_EMPTY_LAYER {
136+
SmartString::default()
137+
} else {
138+
layer_name_from_file.into()
139+
};
140+
let empty_name_component =
141+
Component::new(c.clone(), layer_name.clone(), SmartString::default());
142+
{
143+
let cfg_file = PathBuf::from(location.as_ref())
144+
.join(component_to_relative_path(&empty_name_component))
145+
.join("impl.cfg");
146+
147+
if cfg_file.is_file() {
148+
result.insert(empty_name_component.clone());
149+
debug!("Registered component {}", empty_name_component);
150+
}
151+
}
152+
// also load all named components
153+
for name in layer.path().read_dir()? {
154+
let name = name?;
155+
let named_component = Component::new(
156+
c.clone(),
157+
layer_name.clone(),
158+
name.file_name().to_string_lossy().into(),
159+
);
160+
let cfg_file = PathBuf::from(location.as_ref())
161+
.join(component_to_relative_path(&named_component))
162+
.join("impl.cfg");
163+
164+
if cfg_file.is_file() {
165+
result.insert(named_component.clone());
166+
debug!("Registered component {}", named_component);
167+
}
168+
}
169+
}
170+
}
171+
}
172+
} // end for all components
173+
Ok(result)
174+
}
175+
116176
impl<CT: ComponentType> Graph<CT> {
117177
/// Create a new and empty instance without any location on the disk.
118178
pub fn new(disk_based: bool) -> Result<Self> {
@@ -233,7 +293,10 @@ impl<CT: ComponentType> Graph<CT> {
233293

234294
let logfile_exists = log_path.exists() && log_path.is_file();
235295

236-
self.find_components_from_disk(&dir2load)?;
296+
self.components = find_components_from_disk(&dir2load)?
297+
.into_iter()
298+
.map(|c| (c, None))
299+
.collect();
237300

238301
// If backup is active or a write log exists, always do a pre-load to get the complete corpus.
239302
if logfile_exists | load_from_backup {
@@ -287,63 +350,6 @@ impl<CT: ComponentType> Graph<CT> {
287350
Ok(())
288351
}
289352

290-
fn find_components_from_disk(&mut self, location: &Path) -> Result<()> {
291-
self.components.clear();
292-
293-
// for all component types
294-
for c in CT::all_component_types().into_iter() {
295-
let cpath = PathBuf::from(location).join("gs").join(c.to_string());
296-
297-
if cpath.is_dir() {
298-
// get all the namespaces/layers
299-
for layer in cpath.read_dir()? {
300-
let layer = layer?;
301-
if layer.path().is_dir() {
302-
// try to load the component with the empty name
303-
let layer_file_name = layer.file_name();
304-
let layer_name_from_file = layer_file_name.to_string_lossy();
305-
let layer_name: SmartString = if layer_name_from_file == DEFAULT_EMPTY_LAYER
306-
{
307-
SmartString::default()
308-
} else {
309-
layer_name_from_file.into()
310-
};
311-
let empty_name_component =
312-
Component::new(c.clone(), layer_name.clone(), SmartString::default());
313-
{
314-
let cfg_file = PathBuf::from(location)
315-
.join(component_to_relative_path(&empty_name_component))
316-
.join("impl.cfg");
317-
318-
if cfg_file.is_file() {
319-
self.components.insert(empty_name_component.clone(), None);
320-
debug!("Registered component {}", empty_name_component);
321-
}
322-
}
323-
// also load all named components
324-
for name in layer.path().read_dir()? {
325-
let name = name?;
326-
let named_component = Component::new(
327-
c.clone(),
328-
layer_name.clone(),
329-
name.file_name().to_string_lossy().into(),
330-
);
331-
let cfg_file = PathBuf::from(location)
332-
.join(component_to_relative_path(&named_component))
333-
.join("impl.cfg");
334-
335-
if cfg_file.is_file() {
336-
self.components.insert(named_component.clone(), None);
337-
debug!("Registered component {}", named_component);
338-
}
339-
}
340-
}
341-
}
342-
}
343-
} // end for all components
344-
Ok(())
345-
}
346-
347353
fn internal_save(&self, location: &Path) -> Result<()> {
348354
let location = PathBuf::from(location);
349355

graphannis/src/annis/db/corpusstorage.rs

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2387,11 +2387,41 @@ impl CorpusStorage {
23872387
ctype: Option<AnnotationComponentType>,
23882388
name: Option<&str>,
23892389
) -> Result<Vec<Component<AnnotationComponentType>>> {
2390-
if let Ok(db_entry) = self.get_loaded_entry(corpus_name, false, false) {
2391-
let lock = db_entry.read()?;
2392-
if let Ok(db) = get_read_or_error(&lock) {
2393-
return Ok(db.get_all_components(ctype, name));
2394-
}
2390+
if let Ok(db_entry) = self.get_entry(corpus_name) {
2391+
// Lock the entry so no-one else can write to it
2392+
let _lock = db_entry.read()?;
2393+
// Get the components by listing the directories on the disk. We
2394+
// could use `Graph::get_all_components()`, but this needs to load the
2395+
// annotation storage, which can be costly.
2396+
let db_path = self.corpus_directory_on_disk(corpus_name);
2397+
let backup = db_path.join("backup");
2398+
2399+
let dir2load = if backup.exists() && backup.is_dir() {
2400+
backup.clone()
2401+
} else {
2402+
db_path.join("current")
2403+
};
2404+
let components = graphannis_core::graph::find_components_from_disk::<
2405+
AnnotationComponentType,
2406+
_,
2407+
>(&dir2load)?
2408+
.into_iter()
2409+
.filter(|c| {
2410+
if let Some(ctype) = ctype.as_ref() {
2411+
&c.get_type() == ctype
2412+
} else {
2413+
true
2414+
}
2415+
})
2416+
.filter(|c| {
2417+
if let Some(name) = name {
2418+
c.name.as_str() == name
2419+
} else {
2420+
true
2421+
}
2422+
})
2423+
.collect();
2424+
return Ok(components);
23952425
}
23962426
Ok(vec![])
23972427
}

0 commit comments

Comments
 (0)