Skip to content

Commit 47b25bb

Browse files
authored
Merge pull request #18 from ssjunnebo/centralise_regex
Centralise regex
2 parents 2aa4990 + ea81a1a commit 47b25bb

8 files changed

Lines changed: 90 additions & 36 deletions

File tree

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,8 @@ Run status is tracked in CouchDB with events including:
143143
- Run directories are named according to sequencer-specific ID formats (defined in run classes)
144144
- Final completion is indicated by the presence of a sequencer-specific final file (e.g., `CopyComplete.txt` for Illumina)
145145
- Remote storage is accessible via rsync over SSH
146-
- CouchDB is accessible and the database specified in the config exists and has a ddoc called `events` with a view called `current_status_per_runfolder` that emits a dictionary of all the statuses and their current state (true/false)
146+
- CouchDB is accessible and the database specified in the config exists and has a ddoc called `events` with a view called `current_status_per_runfolder` that emits a dictionary of all the statuses and their current state (true/false)
147+
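- CouchDB is accessible and the database `gs_configs` contains a document called `regex_patterns` holding the regexes used to identify the different run types, stored under `flowcell_patterns` → run family → run type. Each regex must define a named group `flowcell_id`, since the flowcell ID is extracted from that group.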
147148
- The flowcell ID is set to correspond to the ID that is scanned with a barcode scanner during sequencing setup in the lab
148149

149150
### Status Files
@@ -200,3 +201,4 @@ To add support for a new sequencer, add the following to dataflow_transfer:
200201
2. Import the new class in `dataflow_transfer/run_classes/__init__.py`
201202
3. Add a test fixture for the new run in `dataflow_transfer/tests/test_run_classes.py` and include it in the relevant tests
202203
4. Add a section for the sequencer in the config file
204+
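5. Add a regular expression matching the run folder name of the sequencer to the `regex_patterns` document in the CouchDB database `gs_configs`, under `flowcell_patterns.<run_family>.<run_type>` (the same values as the `run_family` and `run_type` attributes of the new class). The pattern must define a named group `flowcell_id`, e.g. `(?P<flowcell_id>[A-Z0-9]+)`, because the flowcell ID is read from that group.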

dataflow_transfer/run_classes/element_runs.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
class ElementRun(Run):
77
"""Defines an Element sequencing run"""
88

9+
run_family = "Element"
10+
911
def __init__(self, run_dir, configuration):
1012
super().__init__(run_dir, configuration)
1113
self.final_file = "RunUploaded.json"
@@ -18,11 +20,7 @@ class AVITIRun(ElementRun):
1820
run_type = "AVITI"
1921

2022
def __init__(self, run_dir, configuration):
21-
self.run_id_format = (
22-
r"^\d{8}_AV\d{6}_(A|B)\d{10}$" # 20251007_AV242106_A2507535225
23-
)
2423
super().__init__(run_dir, configuration)
25-
self.flowcell_id = self.run_id.split("_")[-1][1:] # 2507535225
2624

2725

2826
# TODO: Add Teton run class

dataflow_transfer/run_classes/generic_runs.py

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
class Run:
1313
"""Defines a generic sequencing run"""
1414

15+
run_type = None
16+
run_family = None
17+
1518
def __init__(self, run_dir, configuration):
1619
self.run_dir = run_dir
1720
self.run_id = os.path.basename(run_dir)
@@ -33,6 +36,27 @@ def __init__(self, run_dir, configuration):
3336
)
3437
self.remote_destination = self.sequencer_config.get("remote_destination")
3538
self.db = StatusdbSession(self.configuration.get("statusdb"))
39+
self.run_id_format = self._resolve_run_id_format()
40+
self.flowcell_id = (
41+
re.match(self.run_id_format, self.run_id).group("flowcell_id")
42+
if self.run_id_format
43+
else None
44+
)
45+
46+
def _resolve_run_id_format(self):
47+
"""Resolve the run ID regex from central config."""
48+
run_id_format = None
49+
if self.run_family and self.run_type:
50+
try:
51+
run_id_format = self.db.get_regex_pattern(
52+
self.run_family, self.run_type
53+
)
54+
except Exception as exc:
55+
logger.warning(
56+
f"Unable to load run_id_format for {self.run_type} from regex config: {exc}"
57+
)
58+
59+
return run_id_format
3660

3761
def confirm_run_type(self):
3862
"""Compare run ID with expected format for the run type."""
@@ -159,10 +183,13 @@ def has_status(self, status_name):
159183

160184
def update_statusdb(self, status, additional_info=None):
161185
"""Update the statusdb document for this run with the given status."""
162-
db_doc = (
163-
self.db.get_db_doc(ddoc="lookup", view="runfolder_id", run_id=self.run_id)
164-
or {}
186+
doc_id = self.db.get_doc_id(
187+
ddoc="lookup", view="runfolder_id", run_id=self.run_id
165188
)
189+
if doc_id:
190+
db_doc = self.db.get_document(db=self.db.db_name, doc_id=doc_id)
191+
else:
192+
db_doc = {}
166193

167194
statuses_to_only_update_once = [
168195
"sequencing_started",

dataflow_transfer/run_classes/illumina_runs.py

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,11 @@
66
class IlluminaRun(Run):
77
"""Defines an Illumina sequencing run"""
88

9+
run_family = "Illumina"
10+
911
def __init__(self, run_dir, configuration):
1012
super().__init__(run_dir, configuration)
1113
self.final_file = "CopyComplete.txt"
12-
self.flowcell_id = self.run_id.split("_")[-1]
1314

1415

1516
@register_run_class
@@ -19,11 +20,7 @@ class NovaSeqXPlusRun(IlluminaRun):
1920
run_type = "NovaSeqXPlus"
2021

2122
def __init__(self, run_dir, configuration):
22-
self.run_id_format = (
23-
r"^\d{8}_[A-Z0-9]+_\d{4}_[A-Z0-9]+$" # 20251010_LH00202_0284_B22CVHTLT1
24-
)
2523
super().__init__(run_dir, configuration)
26-
self.flowcell_id = self.run_id.split("_")[-1][1:] # 22CVHTLT1
2724

2825

2926
@register_run_class
@@ -33,9 +30,6 @@ class NextSeqRun(IlluminaRun):
3330
run_type = "NextSeq"
3431

3532
def __init__(self, run_dir, configuration):
36-
self.run_id_format = (
37-
r"^\d{6}_[A-Z0-9]+_\d{3}_[A-Z0-9]+$" # 251015_VH00203_572_AAHFHCCM5
38-
)
3933
super().__init__(run_dir, configuration)
4034

4135

@@ -46,9 +40,6 @@ class MiSeqRun(IlluminaRun):
4640
run_type = "MiSeq"
4741

4842
def __init__(self, run_dir, configuration):
49-
self.run_id_format = (
50-
r"^\d{6}_[A-Z0-9]+_\d{4}_[A-Z0-9\-]+$" # 251015_M01548_0646_000000000-M6D7K
51-
)
5243
super().__init__(run_dir, configuration)
5344

5445

@@ -59,6 +50,4 @@ class MiSeqi100Run(IlluminaRun):
5950
run_type = "MiSeqi100"
6051

6152
def __init__(self, run_dir, configuration):
62-
self.run_id_format = r"^\d{8}_[A-Z0-9]+_\d{4}_[A-Z0-9]{10}-SC3$" # 20260128_SH01140_0002_ASC2150561-SC3
6353
super().__init__(run_dir, configuration)
64-
self.flowcell_id = self.run_id.split("_")[-1][1:] # SC2150561-SC3

dataflow_transfer/run_classes/ont_runs.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,11 @@
66
class ONTRun(Run):
77
"""Defines a ONT sequencing run"""
88

9+
run_family = "ONT"
10+
911
def __init__(self, run_dir, configuration):
1012
super().__init__(run_dir, configuration)
1113
self.final_file = "final_summary.txt"
12-
self.flowcell_id = self.run_id.split("_")[-2]
1314

1415

1516
@register_run_class
@@ -19,7 +20,6 @@ class PromethIONRun(ONTRun):
1920
run_type = "PromethION"
2021

2122
def __init__(self, run_dir, configuration):
22-
self.run_id_format = r"^\d{8}_\d{4}_[A-Z0-9]{2}_P[A-Z0-9]+_[a-f0-9]{8}$" # 20251015_1051_3B_PBG60686_0af3a2e0
2323
super().__init__(run_dir, configuration)
2424

2525

@@ -30,5 +30,4 @@ class MinIONRun(ONTRun):
3030
run_type = "MinION"
3131

3232
def __init__(self, run_dir, configuration):
33-
self.run_id_format = r"^\d{8}_\d{4}_MN[A-Z0-9]+_[A-Z0-9]+_[a-f0-9]{8}$" # 20240229_1404_MN19414_ASH657_7a74bf8f
3433
super().__init__(run_dir, configuration)

dataflow_transfer/tests/test_run_classes.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,26 @@ def __init__(self, config):
140140
def get_db_doc(self, ddoc, view, run_id):
141141
return None
142142

143+
def get_regex_pattern(self, run_family, run_type):
144+
if run_family == "Illumina":
145+
if run_type == "NovaSeqXPlus": # 20251010_LH00202_0284_B22CVHTLT1
146+
return r"^(?P<date>\d{8})_(?P<instrument>[A-Z0-9]+)_\d{4}_(?P<position>(A|B))(?P<flowcell_id>[A-Z0-9]+)$"
147+
elif run_type == "NextSeq": # 251015_VH00203_572_AAHFHCCM5
148+
return r"^(?P<date>\d{6})_(?P<instrument>[A-Z0-9]+)_\d{3}_(?P<flowcell_id>[A-Z0-9]+)$"
149+
elif run_type == "MiSeq": # 251015_M01548_0646_000000000-M6D7K
150+
return r"^(?P<date>\d{6})_(?P<instrument>[A-Z0-9]+)_\d{4}_(?P<flowcell_id>[A-Z0-9\-]+)$"
151+
elif run_type == "MiSeqi100": # 20260128_SH01140_0002_ASC2150561-SC3
152+
return r"^(?P<date>\d{8})_(?P<instrument>[A-Z0-9]+)_\d{4}_A(?P<flowcell_id>[A-Z0-9]{9}-SC3)$"
153+
elif run_family == "ONT":
154+
if run_type == "PromethION": # 20251015_1051_3B_PBG60686_0af3a2e0
155+
return r"^(?P<date>\d{8})_(?P<time>\d{4})_(?P<position>[A-Z0-9]{2})_(?P<flowcell_id>P[A-Z0-9]+)_(?P<run_hash>[a-f0-9]{8})$"
156+
elif run_type == "MinION": # 20240229_1404_MN19414_ASH657_7a74bf8f
157+
return r"^(?P<date>\d{8})_(?P<time>\d{4})_(?P<position>MN[A-Z0-9]+)_(?P<flowcell_id>[A-Z0-9]+)_(?P<run_hash>[a-f0-9]{8})$"
158+
elif run_family == "Element":
159+
if run_type == "AVITI": # 20251007_AV242106_A2507535225
160+
return r"^(?P<date>\d{8})_(?P<instrument>AV\d{6})_(?P<position>(A|B))(?P<flowcell_id>\d{10})$"
161+
return None
162+
143163
def update_db_doc(self, doc):
144164
pass
145165

@@ -458,8 +478,12 @@ def test_update_statusdb(
458478
class MockDB:
459479
def __init__(self):
460480
self.updated_doc = None
481+
self.db_name = "mock_db"
461482

462-
def get_db_doc(self, ddoc, view, run_id):
483+
def get_doc_id(self, ddoc, view, run_id):
484+
return "mock_doc_id"
485+
486+
def get_document(self, db, doc_id):
463487
return {"events": existing_statuses, "files": {}}
464488

465489
def update_db_doc(self, doc):

dataflow_transfer/utils/statusdb.py

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -55,16 +55,31 @@ def _retry_call(self, func):
5555
# re-raise last exception for caller to handle
5656
raise last_exception
5757

58-
def get_db_doc(self, ddoc, view, run_id):
59-
"""Retrieve a document from the database via retried call."""
60-
doc_id = self.get_doc_id(ddoc, view, run_id)
61-
if doc_id:
62-
return self._retry_call(
63-
lambda: self.connection.get_document(
64-
db=self.db_name, doc_id=doc_id
65-
).get_result()
66-
)
67-
return None
58+
def get_document(self, db, doc_id):
59+
"""Retrieve a document from any database via retried call."""
60+
return self._retry_call(
61+
lambda: self.connection.get_document(db=db, doc_id=doc_id).get_result()
62+
)
63+
64+
def get_regex_pattern(
65+
self,
66+
run_family,
67+
run_type,
68+
regex_db="gs_configs",
69+
regex_doc_id="regex_patterns",
70+
):
71+
"""Lookup the python regex pattern for a run type from the central regex config document."""
72+
regex_doc = self.get_document(db=regex_db, doc_id=regex_doc_id)
73+
if not regex_doc:
74+
return None
75+
76+
flowcell_patterns = regex_doc.get("flowcell_patterns", {})
77+
family_patterns = flowcell_patterns.get(run_family, {})
78+
if not family_patterns:
79+
return None
80+
81+
pattern = family_patterns.get(run_type)
82+
return pattern
6883

6984
def get_doc_id(self, ddoc, view, run_id):
7085
"""Retrieve a document ID from the database via retried call."""

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ ignore = [
2020

2121
[project]
2222
name = "dataflow_transfer"
23-
version = "1.1.3"
23+
version = "1.1.4"
2424
description = "Script for transferring sequencing data from sequencers to storage"
2525
authors = [
2626
{ name = "Sara Sjunnebo", email = "sara.sjunnebo@scilifelab.se" },

0 commit comments

Comments
 (0)