Skip to content

Commit afe7bde

Browse files
authored
Merge pull request #324 from korpling/get-components-without-loading-db
Avoid loading the node annotation storage when listing the components
2 parents 8474176 + b58d8e3 commit afe7bde

5 files changed

Lines changed: 114 additions & 72 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
55

66
## [Unreleased]
77

8+
### Fixed
9+
10+
- Avoid loading the node annotation storage when listing the components for a
11+
corpus in the `CorpusStorage`. Before this change, querying for components via
12+
the webservice could block the corpus cache.
13+
814
## [3.8.0] - 2025-05-14
915

1016
### Changed

core/src/graph/mod.rs

Lines changed: 66 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ use crate::{
1414
use clru::CLruCache;
1515
use rayon::prelude::*;
1616
use smartstring::alias::String as SmartString;
17-
use std::io::prelude::*;
1817
use std::ops::Bound::Included;
1918
use std::path::{Path, PathBuf};
2019
use std::string::ToString;
@@ -23,6 +22,7 @@ use std::{
2322
sync::{Arc, Mutex},
2423
};
2524
use std::{collections::BTreeMap, num::NonZeroUsize};
25+
use std::{collections::BTreeSet, io::prelude::*};
2626
use update::{GraphUpdate, UpdateEvent};
2727

2828
pub const ANNIS_NS: &str = "annis";
@@ -113,6 +113,66 @@ fn component_path<CT: ComponentType>(
113113
}
114114
}
115115

116+
/// List all the components that belong to the corpus in the given directory.
117+
pub fn find_components_from_disk<CT: ComponentType, P: AsRef<Path>>(
118+
location: P,
119+
) -> Result<BTreeSet<Component<CT>>> {
120+
let mut result = BTreeSet::new();
121+
// for all component types
122+
for c in CT::all_component_types().into_iter() {
123+
let cpath = PathBuf::from(location.as_ref())
124+
.join("gs")
125+
.join(c.to_string());
126+
127+
if cpath.is_dir() {
128+
// get all the namespaces/layers
129+
for layer in cpath.read_dir()? {
130+
let layer = layer?;
131+
if layer.path().is_dir() {
132+
// try to load the component with the empty name
133+
let layer_file_name = layer.file_name();
134+
let layer_name_from_file = layer_file_name.to_string_lossy();
135+
let layer_name: SmartString = if layer_name_from_file == DEFAULT_EMPTY_LAYER {
136+
SmartString::default()
137+
} else {
138+
layer_name_from_file.into()
139+
};
140+
let empty_name_component =
141+
Component::new(c.clone(), layer_name.clone(), SmartString::default());
142+
{
143+
let cfg_file = PathBuf::from(location.as_ref())
144+
.join(component_to_relative_path(&empty_name_component))
145+
.join("impl.cfg");
146+
147+
if cfg_file.is_file() {
148+
result.insert(empty_name_component.clone());
149+
debug!("Registered component {}", empty_name_component);
150+
}
151+
}
152+
// also load all named components
153+
for name in layer.path().read_dir()? {
154+
let name = name?;
155+
let named_component = Component::new(
156+
c.clone(),
157+
layer_name.clone(),
158+
name.file_name().to_string_lossy().into(),
159+
);
160+
let cfg_file = PathBuf::from(location.as_ref())
161+
.join(component_to_relative_path(&named_component))
162+
.join("impl.cfg");
163+
164+
if cfg_file.is_file() {
165+
result.insert(named_component.clone());
166+
debug!("Registered component {}", named_component);
167+
}
168+
}
169+
}
170+
}
171+
}
172+
} // end for all components
173+
Ok(result)
174+
}
175+
116176
impl<CT: ComponentType> Graph<CT> {
117177
/// Create a new and empty instance without any location on the disk.
118178
pub fn new(disk_based: bool) -> Result<Self> {
@@ -233,7 +293,10 @@ impl<CT: ComponentType> Graph<CT> {
233293

234294
let logfile_exists = log_path.exists() && log_path.is_file();
235295

236-
self.find_components_from_disk(&dir2load)?;
296+
self.components = find_components_from_disk(&dir2load)?
297+
.into_iter()
298+
.map(|c| (c, None))
299+
.collect();
237300

238301
// If backup is active or a write log exists, always do a pre-load to get the complete corpus.
239302
if logfile_exists | load_from_backup {
@@ -287,63 +350,6 @@ impl<CT: ComponentType> Graph<CT> {
287350
Ok(())
288351
}
289352

290-
fn find_components_from_disk(&mut self, location: &Path) -> Result<()> {
291-
self.components.clear();
292-
293-
// for all component types
294-
for c in CT::all_component_types().into_iter() {
295-
let cpath = PathBuf::from(location).join("gs").join(c.to_string());
296-
297-
if cpath.is_dir() {
298-
// get all the namespaces/layers
299-
for layer in cpath.read_dir()? {
300-
let layer = layer?;
301-
if layer.path().is_dir() {
302-
// try to load the component with the empty name
303-
let layer_file_name = layer.file_name();
304-
let layer_name_from_file = layer_file_name.to_string_lossy();
305-
let layer_name: SmartString = if layer_name_from_file == DEFAULT_EMPTY_LAYER
306-
{
307-
SmartString::default()
308-
} else {
309-
layer_name_from_file.into()
310-
};
311-
let empty_name_component =
312-
Component::new(c.clone(), layer_name.clone(), SmartString::default());
313-
{
314-
let cfg_file = PathBuf::from(location)
315-
.join(component_to_relative_path(&empty_name_component))
316-
.join("impl.cfg");
317-
318-
if cfg_file.is_file() {
319-
self.components.insert(empty_name_component.clone(), None);
320-
debug!("Registered component {}", empty_name_component);
321-
}
322-
}
323-
// also load all named components
324-
for name in layer.path().read_dir()? {
325-
let name = name?;
326-
let named_component = Component::new(
327-
c.clone(),
328-
layer_name.clone(),
329-
name.file_name().to_string_lossy().into(),
330-
);
331-
let cfg_file = PathBuf::from(location)
332-
.join(component_to_relative_path(&named_component))
333-
.join("impl.cfg");
334-
335-
if cfg_file.is_file() {
336-
self.components.insert(named_component.clone(), None);
337-
debug!("Registered component {}", named_component);
338-
}
339-
}
340-
}
341-
}
342-
}
343-
} // end for all components
344-
Ok(())
345-
}
346-
347353
fn internal_save(&self, location: &Path) -> Result<()> {
348354
let location = PathBuf::from(location);
349355

@@ -376,7 +382,7 @@ impl<CT: ComponentType> Graph<CT> {
376382

377383
fn get_cached_node_id_from_name(
378384
&self,
379-
node_name: Cow<String>,
385+
node_name: Cow<str>,
380386
cache: &mut CLruCache<String, Option<NodeID>>,
381387
) -> Result<Option<NodeID>> {
382388
if let Some(id) = cache.get(node_name.as_ref()) {

graphannis/src/annis/db/aql/model.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,10 @@ pub struct AQLUpdateGraphIndex {
167167
}
168168

169169
impl AQLUpdateGraphIndex {
170+
#[allow(
171+
clippy::owned_cow,
172+
reason = "We don't want to copy the node_name if not necessary and also have to look up &String"
173+
)]
170174
fn get_cached_node_id_from_name(
171175
&mut self,
172176
node_name: Cow<String>,

graphannis/src/annis/db/corpusstorage.rs

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2387,11 +2387,41 @@ impl CorpusStorage {
23872387
ctype: Option<AnnotationComponentType>,
23882388
name: Option<&str>,
23892389
) -> Result<Vec<Component<AnnotationComponentType>>> {
2390-
if let Ok(db_entry) = self.get_loaded_entry(corpus_name, false, false) {
2391-
let lock = db_entry.read()?;
2392-
if let Ok(db) = get_read_or_error(&lock) {
2393-
return Ok(db.get_all_components(ctype, name));
2394-
}
2390+
if let Ok(db_entry) = self.get_entry(corpus_name) {
2391+
// Lock the entry so no-one else can write to it
2392+
let _lock = db_entry.read()?;
2393+
// Get the components by listing the directories on the disk. We
2394+
// could use `Graph::get_all_components()`, but this needs to load the
2395+
// annotation storage which can be costly.
2396+
let db_path = self.corpus_directory_on_disk(corpus_name);
2397+
let backup = db_path.join("backup");
2398+
2399+
let dir2load = if backup.exists() && backup.is_dir() {
2400+
backup.clone()
2401+
} else {
2402+
db_path.join("current")
2403+
};
2404+
let components = graphannis_core::graph::find_components_from_disk::<
2405+
AnnotationComponentType,
2406+
_,
2407+
>(&dir2load)?
2408+
.into_iter()
2409+
.filter(|c| {
2410+
if let Some(ctype) = ctype.as_ref() {
2411+
&c.get_type() == ctype
2412+
} else {
2413+
true
2414+
}
2415+
})
2416+
.filter(|c| {
2417+
if let Some(name) = name {
2418+
c.name.as_str() == name
2419+
} else {
2420+
true
2421+
}
2422+
})
2423+
.collect();
2424+
return Ok(components);
23952425
}
23962426
Ok(vec![])
23972427
}

webservice/src/main.rs

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ use simplelog::{
3333
};
3434
use std::fs::OpenOptions;
3535
use std::{
36-
io::{Error, ErrorKind, Result},
36+
io::{Error, Result},
3737
path::PathBuf,
3838
};
3939

@@ -270,12 +270,8 @@ async fn get_api_html_docs(_req: HttpRequest) -> HttpResponse {
270270
#[actix_web::main]
271271
async fn main() -> Result<()> {
272272
// Initialize application and its state
273-
let (cs, settings, db_pool) = init_app_state().map_err(|e| {
274-
Error::new(
275-
ErrorKind::Other,
276-
format!("Could not initialize graphANNIS service: {:?}", e),
277-
)
278-
})?;
273+
let (cs, settings, db_pool) = init_app_state()
274+
.map_err(|e| Error::other(format!("Could not initialize graphANNIS service: {:?}", e)))?;
279275

280276
let bind_address = format!("{}:{}", &settings.bind.host, &settings.bind.port);
281277
let cs = web::Data::new(cs);

0 commit comments

Comments
 (0)