Skip to content

Commit 6048f72

Browse files
committed
fix: validate locus tag for bakta annotation; closes #8
1 parent 20e70b2 commit 6048f72

2 files changed

Lines changed: 27 additions & 1 deletion

File tree

workflow/rules/annotate.smk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ rule annotate_bakta:
162162
"""--- Running BAKTA annotation for sample {wildcards.sample} ---"""
163163
params:
164164
prefix=lambda wc: wc.sample,
165-
locustag=lambda wc: samples.loc[wc.sample]["id_prefix"],
165+
locustag=lambda wc: format_bakta_locustag(samples.loc[wc.sample]["id_prefix"]),
166166
species=lambda wc: samples.loc[wc.sample]["species"],
167167
strain=lambda wc: samples.loc[wc.sample]["strain"],
168168
outdir=lambda wc, output: os.path.dirname(output[0]),

workflow/rules/common.smk

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# import basic packages
22
import pandas as pd
3+
import re
4+
from snakemake import logging
35
from snakemake.utils import validate
46

57

@@ -62,3 +64,27 @@ def get_final_input(wildcards):
6264
tool=config["tool"],
6365
)
6466
return inputs
67+
68+
69+
# -----------------------------------------------------
70+
# helper functions
71+
# -----------------------------------------------------
72+
def format_bakta_locustag(raw):
    """Validate and normalize a locus tag prefix for BAKTA annotation.

    The value is converted to a string, uppercased, and stripped of every
    character outside ``A-Z0-9``. The cleaned tag must be 3-12 characters
    long and start with a letter. A warning is logged when the cleaned tag
    differs from the original string.

    Args:
        raw: Locus tag prefix (any value; it is converted with ``str``).

    Returns:
        The cleaned, uppercase locus tag.

    Raises:
        ValueError: If ``raw`` is missing (``None`` or a float NaN), if the
            cleaned tag is not 3-12 characters long, or if it does not start
            with a letter.
    """
    # A missing value (e.g. an empty table cell read as None/NaN) would
    # otherwise be stringified to "None"/"nan" and cleaned into "NONE"/"NAN",
    # which passes the checks below and silently becomes the locus tag.
    # `raw != raw` is only true for NaN.
    if raw is None or (isinstance(raw, float) and raw != raw):
        raise ValueError(f"locustag is missing (got {raw!r})")

    tag = str(raw)
    # uppercase for BAKTA, then keep only A-Z0-9
    cleaned = re.sub(r"[^A-Z0-9]", "", tag.upper())

    if not 3 <= len(cleaned) <= 12:
        raise ValueError(
            f"locustag '{raw}' -> '{cleaned}' must contain between 3-12 alphanumeric uppercase characters\n"
        )
    if not re.match(r"^[A-Z]", cleaned):
        raise ValueError(f"locustag '{raw}' -> '{cleaned}' must start with a letter")

    # warn if cleaned tag is different from original
    if cleaned != tag:
        # NOTE(review): `logger` is not defined in this block; presumably it
        # is supplied by the Snakemake Snakefile namespace -- confirm.
        logger.warning(
            f"\nlocustag '{raw}' converted to '{cleaned}' to meet BAKTA requirements (between 3 and 12 alphanumeric uppercase characters, start with a letter)\n"
        )
    return cleaned

0 commit comments

Comments
 (0)