Skip to content

Commit 67e17c9

Browse files
Check filesystem for existing corpora on creation, import and deletion
1 parent c368b15 commit 67e17c9

2 files changed

Lines changed: 241 additions & 28 deletions

File tree

graphannis/src/annis/db/corpusstorage.rs

Lines changed: 36 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -991,17 +991,23 @@ impl CorpusStorage {
991991
check_cache_size_and_remove_with_cache(cache, &self.cache_strategy, vec![], false)?;
992992

993993
// remove any possible old corpus
994-
if cache.contains_key(&corpus_name) {
995-
if overwrite_existing {
996-
let old_entry = cache.remove(&corpus_name);
997-
if old_entry.is_some() {
998-
if let Err(e) = std::fs::remove_dir_all(db_path.clone()) {
999-
error!("Error when removing existing files {}", e);
1000-
}
994+
if overwrite_existing {
995+
let old_entry = cache.remove(&corpus_name);
996+
997+
// if there is a cache entry, acquire an exclusive lock for it because
998+
// other queries or background writers might still have access to it and need to finish first
999+
let _lock = old_entry
1000+
.as_ref()
1001+
.map(|db_entry| db_entry.write())
1002+
.transpose()?;
1003+
1004+
if db_path.is_dir() {
1005+
if let Err(e) = std::fs::remove_dir_all(&db_path) {
1006+
error!("Error when removing existing files {}", e);
10011007
}
1002-
} else {
1003-
return Err(GraphAnnisError::CorpusExists(corpus_name.to_string()));
10041008
}
1009+
} else if cache.contains_key(&corpus_name) || db_path.is_dir() {
1010+
return Err(GraphAnnisError::CorpusExists(corpus_name.to_string()));
10051011
}
10061012

10071013
if let Err(e) = std::fs::create_dir_all(&db_path) {
@@ -1355,24 +1361,27 @@ impl CorpusStorage {
13551361
let db_path = self.corpus_directory_on_disk(corpus_name);
13561362

13571363
let mut cache_lock = self.corpus_cache.write()?;
1358-
13591364
let cache = &mut *cache_lock;
13601365

1361-
// remove any possible old corpus
1362-
if let Some(db_entry) = cache.remove(corpus_name) {
1363-
// aquire exclusive lock for this cache entry because
1364-
// other queries or background writer might still have access it and need to finish first
1365-
let mut _lock = db_entry.write()?;
1366-
1367-
if db_path.is_dir() && db_path.exists() {
1368-
std::fs::remove_dir_all(db_path).map_err(|e| {
1369-
CorpusStorageError::RemoveFileForCorpus {
1370-
corpus: corpus_name.to_string(),
1371-
source: e,
1372-
}
1373-
})?
1374-
}
1366+
let db_entry = cache.remove(corpus_name);
1367+
1368+
// if there is a cache entry, acquire an exclusive lock for it because
1369+
// other queries or background writers might still have access to it and need to finish first
1370+
let _lock = db_entry
1371+
.as_ref()
1372+
.map(|db_entry| db_entry.write())
1373+
.transpose()?;
1374+
1375+
if db_path.is_dir() {
1376+
std::fs::remove_dir_all(db_path).map_err(|e| {
1377+
CorpusStorageError::RemoveFileForCorpus {
1378+
corpus: corpus_name.to_string(),
1379+
source: e,
1380+
}
1381+
})?;
13751382

1383+
Ok(true)
1384+
} else if db_entry.is_some() {
13761385
Ok(true)
13771386
} else {
13781387
Ok(false)
@@ -1384,11 +1393,12 @@ impl CorpusStorage {
13841393
/// Use [`apply_update`](CorpusStorage::apply_update) to add elements to the corpus. Returns whether a
13851394
/// new corpus was created.
13861395
pub fn create_empty_corpus(&self, corpus_name: &str, disk_based: bool) -> Result<bool> {
1387-
let mut cache_lock = self.corpus_cache.write()?;
1396+
let db_path = self.corpus_directory_on_disk(corpus_name);
13881397

1398+
let mut cache_lock = self.corpus_cache.write()?;
13891399
let cache = &mut *cache_lock;
13901400

1391-
if cache.contains_key(corpus_name) {
1401+
if cache.contains_key(corpus_name) || db_path.is_dir() {
13921402
Ok(false)
13931403
} else {
13941404
self.load_entry_with_lock(&mut cache_lock, corpus_name, true, disk_based)?;

graphannis/src/annis/db/corpusstorage/tests.rs

Lines changed: 205 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use pretty_assertions::assert_eq;
2525
use super::SearchQuery;
2626

2727
#[test]
28-
fn delete() {
28+
fn delete_existing_cached_corpus() {
2929
let tmp = tempfile::tempdir().unwrap();
3030
let cs = CorpusStorage::with_auto_cache_size(tmp.path(), false).unwrap();
3131
// fully load a corpus
@@ -38,7 +38,81 @@ fn delete() {
3838

3939
cs.apply_update("testcorpus", &mut g).unwrap();
4040
cs.preload("testcorpus").unwrap();
41-
cs.delete("testcorpus").unwrap();
41+
42+
let deleted = cs.delete("testcorpus").unwrap();
43+
44+
assert_eq!(true, deleted);
45+
}
46+
47+
#[test]
48+
fn delete_existing_uncached_corpus() {
49+
let tmp = tempfile::tempdir().unwrap();
50+
{
51+
let cs = CorpusStorage::with_auto_cache_size(tmp.path(), false).unwrap();
52+
let mut g = GraphUpdate::new();
53+
g.add_event(UpdateEvent::AddNode {
54+
node_name: "test".to_string(),
55+
node_type: "node".to_string(),
56+
})
57+
.unwrap();
58+
59+
cs.apply_update("testcorpus", &mut g).unwrap();
60+
}
61+
62+
{
63+
let cs = CorpusStorage::with_auto_cache_size(tmp.path(), false).unwrap();
64+
65+
let deleted = cs.delete("testcorpus").unwrap();
66+
67+
assert_eq!(true, deleted);
68+
}
69+
}
70+
71+
#[test]
72+
fn delete_nonexisting_corpus() {
73+
let tmp = tempfile::tempdir().unwrap();
74+
let cs = CorpusStorage::with_auto_cache_size(tmp.path(), false).unwrap();
75+
76+
let deleted = cs.delete("testcorpus").unwrap();
77+
78+
assert_eq!(false, deleted);
79+
}
80+
81+
#[test]
82+
fn create_empty_corpus_existing_cached() {
83+
let tmp = tempfile::tempdir().unwrap();
84+
let cs = CorpusStorage::with_auto_cache_size(tmp.path(), false).unwrap();
85+
cs.create_empty_corpus("testcorpus", false).unwrap();
86+
87+
let created = cs.create_empty_corpus("testcorpus", false).unwrap();
88+
89+
assert_eq!(false, created);
90+
}
91+
92+
#[test]
93+
fn create_empty_corpus_existing_uncached() {
94+
let tmp = tempfile::tempdir().unwrap();
95+
{
96+
let cs = CorpusStorage::with_auto_cache_size(tmp.path(), false).unwrap();
97+
cs.create_empty_corpus("testcorpus", false).unwrap();
98+
}
99+
{
100+
let cs = CorpusStorage::with_auto_cache_size(tmp.path(), false).unwrap();
101+
102+
let created = cs.create_empty_corpus("testcorpus", false).unwrap();
103+
104+
assert_eq!(false, created);
105+
}
106+
}
107+
108+
#[test]
109+
fn create_empty_corpus_nonexisting() {
110+
let tmp = tempfile::tempdir().unwrap();
111+
let cs = CorpusStorage::with_auto_cache_size(tmp.path(), false).unwrap();
112+
113+
let created = cs.create_empty_corpus("testcorpus", false).unwrap();
114+
115+
assert_eq!(true, created);
42116
}
43117

44118
#[test]
@@ -1257,6 +1331,135 @@ fn import_relative_corpus_with_linked_file() {
12571331
assert_eq!("The content of this file is not important.", file_content);
12581332
}
12591333

1334+
#[test]
1335+
fn import_existing_cached_corpus_no_overwrite() {
1336+
let tmp = tempfile::tempdir().unwrap();
1337+
let cargo_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
1338+
1339+
let cs = CorpusStorage::with_auto_cache_size(tmp.path(), true).unwrap();
1340+
cs.create_empty_corpus("testcorpus", false).unwrap();
1341+
let result = cs.import_from_fs(
1342+
&cargo_dir.join("tests/SaltSampleCorpus.graphml"),
1343+
ImportFormat::GraphML,
1344+
Some("testcorpus".into()),
1345+
false,
1346+
false,
1347+
|_| {},
1348+
);
1349+
1350+
assert!(matches!(result, Err(GraphAnnisError::CorpusExists(_))));
1351+
}
1352+
1353+
#[test]
1354+
fn import_existing_uncached_corpus_no_overwrite() {
1355+
let tmp = tempfile::tempdir().unwrap();
1356+
let cargo_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
1357+
1358+
{
1359+
let cs = CorpusStorage::with_auto_cache_size(tmp.path(), true).unwrap();
1360+
cs.create_empty_corpus("testcorpus", false).unwrap();
1361+
}
1362+
{
1363+
let cs = CorpusStorage::with_auto_cache_size(tmp.path(), true).unwrap();
1364+
let result = cs.import_from_fs(
1365+
&cargo_dir.join("tests/SaltSampleCorpus.graphml"),
1366+
ImportFormat::GraphML,
1367+
Some("testcorpus".into()),
1368+
false,
1369+
false,
1370+
|_| {},
1371+
);
1372+
1373+
assert!(matches!(result, Err(GraphAnnisError::CorpusExists(_))));
1374+
}
1375+
}
1376+
1377+
#[test]
1378+
fn import_existing_cached_corpus_overwrite() {
1379+
let tmp = tempfile::tempdir().unwrap();
1380+
let cargo_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
1381+
1382+
let cs = CorpusStorage::with_auto_cache_size(tmp.path(), true).unwrap();
1383+
cs.import_from_fs(
1384+
&cargo_dir.join("tests/SegmentationWithGaps.graphml"),
1385+
ImportFormat::GraphML,
1386+
Some("testcorpus".into()),
1387+
false,
1388+
false,
1389+
|_| {},
1390+
)
1391+
.unwrap();
1392+
1393+
let num_ordering_components = cs
1394+
.list_components("testcorpus", Some(AnnotationComponentType::Ordering), None)
1395+
.unwrap()
1396+
.len();
1397+
assert_eq!(3, num_ordering_components);
1398+
1399+
cs.import_from_fs(
1400+
&cargo_dir.join("tests/SaltSampleCorpus.graphml"),
1401+
ImportFormat::GraphML,
1402+
Some("testcorpus".into()),
1403+
false,
1404+
true,
1405+
|_| {},
1406+
)
1407+
.unwrap();
1408+
1409+
// Check that the number of ordering components has decreased,
1410+
// showing that the new corpus was not just added on top of the old one
1411+
let num_ordering_components = cs
1412+
.list_components("testcorpus", Some(AnnotationComponentType::Ordering), None)
1413+
.unwrap()
1414+
.len();
1415+
assert_eq!(1, num_ordering_components);
1416+
}
1417+
1418+
#[test]
1419+
fn import_existing_uncached_corpus_overwrite() {
1420+
let tmp = tempfile::tempdir().unwrap();
1421+
let cargo_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
1422+
1423+
{
1424+
let cs = CorpusStorage::with_auto_cache_size(tmp.path(), true).unwrap();
1425+
cs.import_from_fs(
1426+
&cargo_dir.join("tests/SegmentationWithGaps.graphml"),
1427+
ImportFormat::GraphML,
1428+
Some("testcorpus".into()),
1429+
false,
1430+
false,
1431+
|_| {},
1432+
)
1433+
.unwrap();
1434+
1435+
let num_ordering_components = cs
1436+
.list_components("testcorpus", Some(AnnotationComponentType::Ordering), None)
1437+
.unwrap()
1438+
.len();
1439+
assert_eq!(3, num_ordering_components);
1440+
}
1441+
{
1442+
let cs = CorpusStorage::with_auto_cache_size(tmp.path(), true).unwrap();
1443+
cs.import_from_fs(
1444+
&cargo_dir.join("tests/SaltSampleCorpus.graphml"),
1445+
ImportFormat::GraphML,
1446+
Some("testcorpus".into()),
1447+
false,
1448+
true,
1449+
|_| {},
1450+
)
1451+
.unwrap();
1452+
1453+
// Check that the number of ordering components has decreased,
1454+
// showing that the new corpus was not just added on top of the old one
1455+
let num_ordering_components = cs
1456+
.list_components("testcorpus", Some(AnnotationComponentType::Ordering), None)
1457+
.unwrap()
1458+
.len();
1459+
assert_eq!(1, num_ordering_components);
1460+
}
1461+
}
1462+
12601463
#[test]
12611464
#[serial]
12621465
fn load_legacy_binary_corpus_1() {

0 commit comments

Comments
 (0)