Skip to content

Commit 2aa4990

Browse files
authored
Merge pull request #17 from ssjunnebo/remove_metadata_upload
Remove parsing and upload of metadata files to couchdb
2 parents d632350 + 56eb634 commit 2aa4990

8 files changed

Lines changed: 555 additions & 134 deletions

File tree

README.md

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ Dataflow Transfer monitors sequencing run directories and orchestrates the trans
2020
- Dependencies listed in [requirements.txt](requirements.txt):
2121
- PyYAML
2222
- click
23-
- xmltodict
2423
- ibmcloudant
2524
- [run-one](https://launchpad.net/ubuntu/+source/run-one)
2625

@@ -91,16 +90,13 @@ statusdb:
9190
username: couchdb_user
9291
password: couchdb_password
9392
url: couchdb.host.com
94-
database: sequencing_runs
93+
database: flowcell_status
9594

9695
sequencers:
9796
NovaSeqXPlus:
9897
sequencing_path: /sequencing/NovaSeqXPlus
9998
remote_destination: /Illumina/NovaSeqXPlus
10099
metadata_archive: /path/to/metadata/archive/NovaSeqXPlus_data
101-
metadata_for_statusdb:
102-
- RunInfo.xml
103-
- RunParameters.xml
104100
ignore_folders:
105101
- nosync
106102
remote_rsync_options:
@@ -115,8 +111,8 @@ sequencers:
115111
1. **Discovery**: Scans configured sequencing directories for run folders
116112
2. **Validation**: Confirms run ID matches expected format for the sequencer type
117113
3. **Transfer Phases**:
118-
- **Sequencing Phase**: Starts continuous background rsync transfer while sequencing is ongoing (when the final sequencing file doesn't exist). Uploads status and metadata files (specified for each sequencer type in the config with `metadata_for_statusdb`) to database.
119-
- **Final Transfer**: After sequencing completes (final sequencing file appears), syncs specified metadata file to archive location, initiates final rsync transfer and captures exit codes.
114+
- **Sequencing Phase**: Starts continuous background rsync transfer while sequencing is ongoing (when the final sequencing file doesn't exist). Uploads the run status to the database.
115+
- **Final Transfer**: After sequencing completes (final sequencing file appears), syncs the specified metadata files to the archive location, initiates the final rsync transfer and captures exit codes.
120116
- **Completion**: Updates database when transfer was successful.
121117

122118
### Status Tracking
@@ -145,10 +141,9 @@ Run status is tracked in CouchDB with events including:
145141
## Assumptions
146142

147143
- Run directories are named according to sequencer-specific ID formats (defined in run classes)
148-
- Final completion is indicated by the presence of a sequencer-specific final file (e.g., `RTAComplete.txt` for Illumina)
144+
- Final completion is indicated by the presence of a sequencer-specific final file (e.g., `CopyComplete.txt` for Illumina)
149145
- Remote storage is accessible via rsync over SSH
150-
- CouchDB is accessible and the database exists
151-
- Metadata files (e.g., RunInfo.xml) are present in run directories for status database updates and sync to metadata archive location
146+
- CouchDB is accessible, and the database specified in the config exists and has a design document (ddoc) called `events` with a view called `current_status_per_runfolder`, which emits a dictionary of all the statuses and their current state (true/false)
152147
- The flowcell ID is set to correspond to the ID that is scanned with a barcode scanner during sequencing setup in the lab
153148

154149
### Status Files

dataflow_transfer/run_classes/generic_runs.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -158,8 +158,7 @@ def has_status(self, status_name):
158158
return True if current_statuses.get(status_name) else False
159159

160160
def update_statusdb(self, status, additional_info=None):
161-
"""Update the statusdb document for this run with the given status
162-
and associated metadata files."""
161+
"""Update the statusdb document for this run with the given status."""
163162
db_doc = (
164163
self.db.get_db_doc(ddoc="lookup", view="runfolder_id", run_id=self.run_id)
165164
or {}
@@ -179,14 +178,7 @@ def update_statusdb(self, status, additional_info=None):
179178
"runfolder_id": self.run_id,
180179
"flowcell_id": self.flowcell_id,
181180
"events": [],
182-
"files": {},
183181
}
184-
files_to_include = fs.locate_metadata(
185-
self.sequencer_config.get("metadata_for_statusdb", []),
186-
self.run_dir,
187-
)
188-
parsed_files = fs.parse_metadata_files(files_to_include)
189-
db_doc["files"].update(parsed_files)
190182
db_doc["events"].append(
191183
{
192184
"event_type": status,

dataflow_transfer/tests/test_filesystem.py

Lines changed: 0 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import json
21
import os
32
import tempfile
43
from subprocess import CalledProcessError
@@ -10,8 +9,6 @@
109
check_exit_status,
1110
find_runs,
1211
get_run_dir,
13-
locate_metadata,
14-
parse_metadata_files,
1512
rsync_is_running,
1613
submit_background_process,
1714
)
@@ -84,38 +81,6 @@ def test_submit_background_process(self, mock_popen):
8481
mock_popen.assert_called_once()
8582

8683

87-
class TestParseMetadataFiles:
88-
def test_parse_json_file(self):
89-
with tempfile.TemporaryDirectory() as tmpdir:
90-
json_file = os.path.join(tmpdir, "metadata.json")
91-
with open(json_file, "w") as f:
92-
json.dump({"key": "value"}, f)
93-
metadata = parse_metadata_files([json_file])
94-
assert "metadata.json" in metadata
95-
assert metadata["metadata.json"]["key"] == "value"
96-
97-
def test_parse_xml_file(self):
98-
with tempfile.TemporaryDirectory() as tmpdir:
99-
xml_file = os.path.join(tmpdir, "metadata.xml")
100-
with open(xml_file, "w") as f:
101-
f.write("<root><key>value</key></root>")
102-
metadata = parse_metadata_files([xml_file])
103-
assert "metadata.xml" in metadata
104-
assert metadata["metadata.xml"]["root"]["key"] == "value"
105-
106-
def test_unsupported_file_type(self):
107-
with tempfile.TemporaryDirectory() as tmpdir:
108-
txt_file = os.path.join(tmpdir, "metadata.txt")
109-
with open(txt_file, "w") as f:
110-
f.write("content")
111-
metadata = parse_metadata_files([txt_file])
112-
assert "metadata.txt" not in metadata
113-
114-
def test_parse_nonexistent_file(self):
115-
metadata = parse_metadata_files(["/nonexistent/file.json"])
116-
assert metadata == {}
117-
118-
11984
class TestCheckExitStatus:
12085
def test_exit_status_zero(self):
12186
with tempfile.TemporaryDirectory() as tmpdir:
@@ -133,27 +98,3 @@ def test_exit_status_nonzero(self):
13398

13499
def test_exit_status_file_not_found(self):
135100
assert check_exit_status("/nonexistent/file") is False
136-
137-
138-
class TestLocateMetadata:
139-
def test_locate_metadata_found(self):
140-
with tempfile.TemporaryDirectory() as tmpdir:
141-
metadata_file = os.path.join(tmpdir, "metadata.json")
142-
open(metadata_file, "w").close()
143-
located = locate_metadata(["metadata.json"], tmpdir)
144-
assert len(located) == 1
145-
assert metadata_file in located
146-
147-
def test_locate_metadata_not_found(self):
148-
with tempfile.TemporaryDirectory() as tmpdir:
149-
located = locate_metadata(["nonexistent.json"], tmpdir)
150-
assert len(located) == 0
151-
152-
def test_locate_metadata_multiple_patterns(self):
153-
with tempfile.TemporaryDirectory() as tmpdir:
154-
open(os.path.join(tmpdir, "meta1.json"), "w").close()
155-
open(os.path.join(tmpdir, "meta2.json"), "w").close()
156-
located = locate_metadata(["meta1.json", "meta2.json"], tmpdir)
157-
assert len(located) == 2
158-
assert os.path.join(tmpdir, "meta1.json") in located
159-
assert os.path.join(tmpdir, "meta2.json") in located

dataflow_transfer/tests/test_run_classes.py

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ def novaseqxplus_testobj(tmp_path):
2121
"sequencers": {
2222
"NovaSeqXPlus": {
2323
"remote_destination": "/data/NovaSeqXPlus",
24-
"metadata_for_statusdb": ["RunInfo.xml", "RunParameters.xml"],
2524
"metadata_archive": "/data/metadata_archive/NovaSeqXPlus",
2625
"ignore_folders": ["nosync"],
2726
"remote_rsync_options": ["--chmod=Dg+s,g+rw"],
@@ -53,7 +52,6 @@ def nextseq_testobj(tmp_path):
5352
"sequencers": {
5453
"NextSeq": {
5554
"remote_destination": "/data/NextSeq",
56-
"metadata_for_statusdb": ["RunInfo.xml", "RunParameters.xml"],
5755
"metadata_archive": "/data/metadata_archive/NextSeq",
5856
"ignore_folders": ["nosync"],
5957
"remote_rsync_options": ["--chmod=Dg+s,g+rw"],
@@ -85,7 +83,6 @@ def miseqseq_testobj(tmp_path):
8583
"sequencers": {
8684
"MiSeq": {
8785
"remote_destination": "/data/MiSeq",
88-
"metadata_for_statusdb": ["RunInfo.xml", "RunParameters.xml"],
8986
"metadata_archive": "/data/metadata_archive/MiSeq",
9087
"ignore_folders": ["nosync"],
9188
"remote_rsync_options": ["--chmod=Dg+s,g+rw"],
@@ -117,7 +114,6 @@ def miseqseqi100_testobj(tmp_path):
117114
"sequencers": {
118115
"MiSeqi100": {
119116
"remote_destination": "/data/MiSeqi100",
120-
"metadata_for_statusdb": ["RunInfo.xml", "RunParameters.xml"],
121117
"metadata_archive": "/data/metadata_archive/MiSeqi100",
122118
"ignore_folders": ["nosync"],
123119
"remote_rsync_options": ["--chmod=Dg+s,g+rw"],
@@ -469,16 +465,6 @@ def get_db_doc(self, ddoc, view, run_id):
469465
def update_db_doc(self, doc):
470466
self.updated_doc = doc
471467

472-
import dataflow_transfer.utils.filesystem as fs
473-
474-
def mock_locate_metadata(metadata_list, run_dir):
475-
return []
476-
477-
def mock_parse_metadata_files(files):
478-
return {}
479-
480-
fs.locate_metadata = mock_locate_metadata
481-
fs.parse_metadata_files = mock_parse_metadata_files
482468
mock_db = MockDB()
483469
run_obj.db = mock_db
484470
run_obj.update_statusdb(status=status_to_update)
Lines changed: 0 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
1-
import json
21
import logging
32
import os
43
import subprocess
54

6-
import xmltodict
7-
85
logger = logging.getLogger(__name__)
96

107

@@ -44,31 +41,6 @@ def submit_background_process(command_str: str):
4441
subprocess.Popen(command_str, stdout=subprocess.PIPE, shell=True)
4542

4643

47-
def parse_metadata_files(files):
48-
"""Given a list of files, read the content into a dict.
49-
Handle .json and .xml files differently."""
50-
metadata = {}
51-
for file_path in files:
52-
try:
53-
if file_path.endswith(".json"):
54-
with open(file_path) as f:
55-
metadata[os.path.basename(file_path)] = json.load(f)
56-
elif file_path.endswith(".xml"):
57-
with open(file_path) as f:
58-
xml_content = xmltodict.parse(
59-
f.read(), attr_prefix="", cdata_key="text"
60-
)
61-
metadata[os.path.basename(file_path)] = xml_content
62-
else:
63-
logger.warning(
64-
f"Unsupported metadata file type for {file_path}. Only .json and .xml are supported."
65-
)
66-
continue
67-
except Exception as e:
68-
logger.error(f"Error reading metadata file {file_path}: {e}")
69-
return metadata
70-
71-
7244
def check_exit_status(file_path):
7345
"""Check the exit status from a given file.
7446
Return True if exit code is 0, else False."""
@@ -78,13 +50,3 @@ def check_exit_status(file_path):
7850
if exit_code == "0":
7951
return True
8052
return False
81-
82-
83-
def locate_metadata(metadata_list, run_dir):
84-
"""Locate metadata in the given run directory."""
85-
located_paths = []
86-
for pattern in metadata_list:
87-
metadata_path = os.path.join(run_dir, pattern)
88-
if os.path.exists(metadata_path):
89-
located_paths.append(metadata_path)
90-
return located_paths

pyproject.toml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ ignore = [
2020

2121
[project]
2222
name = "dataflow_transfer"
23-
version = "1.1.2"
23+
version = "1.1.3"
2424
description = "Script for transferring sequencing data from sequencers to storage"
2525
authors = [
2626
{ name = "Sara Sjunnebo", email = "sara.sjunnebo@scilifelab.se" },
@@ -31,7 +31,6 @@ requires-python = ">=3.11"
3131
dependencies = [
3232
"click",
3333
"PyYAML",
34-
"xmltodict",
3534
"ibmcloudant",
3635
]
3736

@@ -51,4 +50,4 @@ requires = ["setuptools>=68", "wheel"]
5150
build-backend = "setuptools.build_meta"
5251

5352
[tool.setuptools.packages.find]
54-
where = ["."]
53+
where = ["."]

requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
11
PyYAML
22
click
3-
xmltodict
43
ibmcloudant

0 commit comments

Comments
 (0)