From c60cfde51817d618374efa33f7e59e1e7f5867c5 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Fri, 22 May 2026 11:16:50 +0200 Subject: [PATCH 01/17] fix(kpplot): add explicit re-export for KpPlotBase in init --- krakenparser/kpplot/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/krakenparser/kpplot/__init__.py b/krakenparser/kpplot/__init__.py index 147afc0..c353a3e 100644 --- a/krakenparser/kpplot/__init__.py +++ b/krakenparser/kpplot/__init__.py @@ -1 +1,3 @@ from .base import KpPlotBase + +__all__ = ["KpPlotBase"] From ea101872c108512f660a2885b101fc8ce066e511 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Fri, 22 May 2026 11:17:11 +0200 Subject: [PATCH 02/17] ci: migrate workflow from flake8 to ruff --- .github/workflows/python-package.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 553faa6..fe4fe56 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -27,14 +27,14 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install flake8 setuptools wheel + python -m pip install ruff setuptools wheel pip install -e ".[dev]" --no-build-isolation - - name: Lint with flake8 + - name: Lint with ruff run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + # Run linter and fail on any rule violation + ruff check . + # Check if the code format matches Ruff style guide + ruff format --check . - name: Test with pytest run: | pytest --cov=krakenparser --cov-report=xml From 28eed0799f4a0db35faa5dcd5ba469d847386b00 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Fri, 22 May 2026 11:17:37 +0200 Subject: [PATCH 03/17] style(script): apply Ruff import sorting --- krakenparser/pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/krakenparser/pipeline.py b/krakenparser/pipeline.py index c54b4e9..768b5d8 100644 --- a/krakenparser/pipeline.py +++ b/krakenparser/pipeline.py @@ -7,8 +7,6 @@ import sys from pathlib import Path -_log = logging.getLogger(__name__) - import pandas as pd from krakenparser.counts.convert2csv import convert_to_csv @@ -19,6 +17,8 @@ from krakenparser.stats.diversity import calc_alpha_div, calc_beta_div from krakenparser.stats.relabund import calculate_rel_abund +_log = logging.getLogger(__name__) + def _is_processable(path: Path) -> bool: """Return False for hidden files, files with null bytes, or non-UTF-8 files.""" From 15267a471d18a9f4c95a8aa907c8b6ef12c374b8 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Fri, 22 May 2026 11:41:53 +0200 Subject: [PATCH 04/17] refactor(types): fix Pylance warnings in calc_beta_div --- krakenparser/stats/diversity.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/krakenparser/stats/diversity.py b/krakenparser/stats/diversity.py index dd76098..ad49651 100644 --- a/krakenparser/stats/diversity.py +++ b/krakenparser/stats/diversity.py @@ -61,30 +61,31 @@ def calc_alpha_div(df, output_path): def calc_beta_div(df, output_path, rarefaction_depth, seed=None): rng = np.random.default_rng(seed) - rarefied_counts = [] - sample_ids = [] + rarefied_counts: list[np.ndarray] = [] + sample_ids: list[str] = [] for sample, row in df.iterrows(): counts = np.round(row.values).astype(int) if counts.sum() >= rarefaction_depth: rarefied = _subsample_counts(counts, n=rarefaction_depth, rng=rng) rarefied_counts.append(rarefied) - sample_ids.append(sample) + sample_ids.append(str(sample)) if len(rarefied_counts) < 2: raise ValueError("Not enough samples passed the rarefaction threshold.") X = np.array(rarefied_counts, dtype=float) + idx = pd.Index(sample_ids) bray_df = pd.DataFrame( squareform(pdist(X, metric="braycurtis")), - index=sample_ids, - columns=sample_ids, + index=idx, + columns=idx, ) jaccard_df = pd.DataFrame( squareform(pdist(X.astype(bool).astype(float), metric="jaccard")), - index=sample_ids, - columns=sample_ids, + index=idx, + columns=idx, ) bray_df.to_csv(output_path / "beta_div_bray.csv") From 26afe67ded0cf9f18f0ea0d4e4b14cd1ecfd0d59 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Fri, 22 May 2026 15:10:44 +0200 Subject: [PATCH 05/17] feat(utils): add ensure_output_dir helper --- krakenparser/utils.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 krakenparser/utils.py diff --git a/krakenparser/utils.py b/krakenparser/utils.py new file mode 100644 index 0000000..7f9a780 --- /dev/null +++ b/krakenparser/utils.py @@ -0,0 +1,10 @@ +# krakenparser/utils.py +from pathlib import Path + + +def ensure_output_dir(path: str | Path, is_file: bool = True) -> Path: + """Create parent directory for a file output, or the directory itself.""" + p = Path(path) + target = p.parent if is_file else p + target.mkdir(parents=True, exist_ok=True) + return p From 399380630960a6925db4e7e25a6fd42dc2d9bd5c Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Fri, 22 May 2026 15:11:14 +0200 Subject: [PATCH 06/17] refactor(mpa): use ensure_output_dir, add logging and input validation --- krakenparser/mpa/mpa_table.py | 11 ++++++++++- krakenparser/mpa/transform2mpa.py | 13 +++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/krakenparser/mpa/mpa_table.py b/krakenparser/mpa/mpa_table.py index c808383..2837972 100644 --- a/krakenparser/mpa/mpa_table.py +++ b/krakenparser/mpa/mpa_table.py @@ -3,17 +3,25 @@ import argparse import logging +from pathlib import Path + +from krakenparser.utils import ensure_output_dir _log = logging.getLogger(__name__) def combine_mpa(in_files: list[str], o_file: str) -> None: + out_path = ensure_output_dir(o_file, is_file=True) # Plain dict preserves insertion order (Python 3.7+). taxa: dict[str, dict[int, str]] = {} sample_names: list[str] = [] _log.info("Number of files to parse: %d", len(in_files)) + for in_path in in_files: + if not Path(in_path).is_file(): + raise FileNotFoundError(f"Input file not found: {in_path}") + for idx, in_path in enumerate(in_files): sample_name = f"Sample #{idx + 1}" with open(in_path) as fh: @@ -39,7 +47,7 @@ def combine_mpa(in_files: list[str], o_file: str) -> None: n_taxa = len(taxa) _log.info("Number of classifications to write: %d", n_taxa) - with open(o_file, "w") as fh: + with open(out_path, "w") as fh: fh.write("#Classification\t" + "\t".join(sample_names) + "\n") for taxon, counts in taxa.items(): row = [counts.get(i, "0") for i in range(n_samples)] @@ -49,6 +57,7 @@ def combine_mpa(in_files: list[str], o_file: str) -> None: def main() -> None: + logging.basicConfig(level=logging.INFO, format="%(message)s") parser = argparse.ArgumentParser( description="Combine MPA files into a single tab-delimited table." ) diff --git a/krakenparser/mpa/transform2mpa.py b/krakenparser/mpa/transform2mpa.py index a28cde6..8d362f7 100644 --- a/krakenparser/mpa/transform2mpa.py +++ b/krakenparser/mpa/transform2mpa.py @@ -2,10 +2,13 @@ """Convert a Kraken2 report to MetaPhlAn (MPA) format.""" import argparse +import logging import os import sys from pathlib import Path +from krakenparser.utils import ensure_output_dir + # Maps Kraken2 single-letter rank codes to MPA prefixes _RANK_PREFIX = { "D": "d", @@ -18,6 +21,8 @@ "S": "s", } +_log = logging.getLogger(__name__) + def _parse_line(line: str): """ @@ -67,10 +72,13 @@ def kreport_to_mpa( depth d is encountered, all stack entries with depth >= d are popped before the new entry is pushed, keeping the path consistent. """ + if not Path(report_path).is_file(): + raise FileNotFoundError(f"Input file not found: {report_path}") + out_path = ensure_output_dir(output_path, is_file=True) # Stack entries: (structural_depth, mpa_segment, is_standard_rank) stack: list[tuple[int, str, bool]] = [] - with open(report_path) as r_fh, open(output_path, "w") as o_fh: + with open(report_path) as r_fh, open(out_path, "w") as o_fh: if display_header: o_fh.write("#Classification\t" + os.path.basename(report_path) + "\n") @@ -110,6 +118,7 @@ def kreport_to_mpa( def main() -> None: + logging.basicConfig(level=logging.INFO, format="%(message)s") parser = argparse.ArgumentParser( description="Convert a Kraken2 report to MetaPhlAn (MPA) format." ) @@ -203,7 +212,7 @@ def main() -> None: continue out_name = f.name.replace(".kreport", ".MPA.TXT") kreport_to_mpa(str(f), str(output_dir / out_name), **kwargs) - print(f"Converted to MPA successfully. Output stored in {output_dir}") + _log.info(f"Converted to MPA successfully. Output stored in {output_dir}") else: kreport_to_mpa(args.r_file, args.o_file, **kwargs) From 40e401d8221d7dd2771a8e6e635d856376e97272 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Fri, 22 May 2026 15:11:34 +0200 Subject: [PATCH 07/17] refactor(counts): use ensure_output_dir and extract main() --- krakenparser/counts/convert2csv.py | 17 +++++++++-------- krakenparser/counts/split_mpa.py | 7 +++++-- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/krakenparser/counts/convert2csv.py b/krakenparser/counts/convert2csv.py index 1177b91..9fb763e 100755 --- a/krakenparser/counts/convert2csv.py +++ b/krakenparser/counts/convert2csv.py @@ -6,6 +6,8 @@ import pandas as pd +from krakenparser.utils import ensure_output_dir + _log = logging.getLogger(__name__) @@ -13,17 +15,15 @@ def convert_to_csv(input_file, output_file): in_path = Path(input_file) if not in_path.is_file(): raise FileNotFoundError(f"Input file not found: {in_path}") - out_path = Path(output_file) - if not out_path.parent.exists(): - raise FileNotFoundError(f"Output directory does not exist: {out_path.parent}") + out_path = ensure_output_dir(output_file, is_file=True) data = pd.read_csv(in_path, sep="\t", index_col=0) data.T.to_csv(out_path, index_label="Sample_id") _log.info("Data converted and saved as '%s'.", output_file) -if __name__ == "__main__": - # Use argparse to handle command-line arguments +def main() -> None: + logging.basicConfig(level=logging.INFO, format="%(message)s") parser = argparse.ArgumentParser( description="Reads a TXT file, reorganizes the data, and converts it into a CSV file." ) @@ -39,8 +39,9 @@ def convert_to_csv(input_file, output_file): required=True, help="Path to the output CSV file. The script will restructure the data and save it here.", ) - args = parser.parse_args() - - # Call function with parsed arguments convert_to_csv(args.input, args.output) + + +if __name__ == "__main__": + main() diff --git a/krakenparser/counts/split_mpa.py b/krakenparser/counts/split_mpa.py index 0ee87b8..1af110e 100644 --- a/krakenparser/counts/split_mpa.py +++ b/krakenparser/counts/split_mpa.py @@ -9,6 +9,8 @@ import re from pathlib import Path +from krakenparser.utils import ensure_output_dir + _log = logging.getLogger(__name__) @@ -53,8 +55,8 @@ def split_mpa( in_path = Path(input_file) if not in_path.is_file(): raise FileNotFoundError(f"Input file not found: {in_path}") - out_path = Path(output_dir) - (out_path / "txt").mkdir(parents=True, exist_ok=True) + out_path = ensure_output_dir(output_dir, is_file=False) + (out_path / "txt").mkdir(exist_ok=True) lines = in_path.read_text().splitlines() data_lines = [ln for ln in lines if not ln.startswith("#") and ln.strip()] @@ -86,6 +88,7 @@ def split_mpa( def main() -> None: + logging.basicConfig(level=logging.INFO, format="%(message)s") parser = argparse.ArgumentParser( description="Split a combined MPA table into per-rank TXT files." ) From 9e2d738502644b6a3badb754e7b4af359e18b4a3 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Fri, 22 May 2026 15:11:51 +0200 Subject: [PATCH 08/17] refactor(counts): extract main(), add logging, restore dest-file validation --- krakenparser/counts/processing_script.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/krakenparser/counts/processing_script.py b/krakenparser/counts/processing_script.py index 831910e..0d6d59a 100755 --- a/krakenparser/counts/processing_script.py +++ b/krakenparser/counts/processing_script.py @@ -1,10 +1,13 @@ #!/usr/bin/env python import argparse +import logging import os import tempfile from pathlib import Path +_log = logging.getLogger(__name__) + def modify_taxa_names(line): prefixes = ["s__", "g__", "f__", "o__", "c__", "p__"] @@ -47,11 +50,11 @@ def process_files(source_file, destination_file): tmp_path = tmp.name os.replace(tmp_path, dest_path) - print(f"Processed {destination_file} successfully.") + _log.info(f"Processed {destination_file} successfully.") -if __name__ == "__main__": - # Use argparse to parse command-line arguments +def main() -> None: + logging.basicConfig(level=logging.INFO, format="%(message)s") parser = argparse.ArgumentParser( description="Reads a source file, processes its first line, modifies taxa names in a destination file, and updates it." ) @@ -67,8 +70,9 @@ def process_files(source_file, destination_file): required=True, help="Path to the destination file. This file's contents will be updated with cleaned taxa names.", ) - args = parser.parse_args() - - # Call the function with parsed arguments process_files(args.input, args.output) + + +if __name__ == "__main__": + main() From c320f0f1139218b12d111354691675589cbc628b Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Fri, 22 May 2026 15:12:10 +0200 Subject: [PATCH 09/17] refactor(stats): use ensure_output_dir, add logging and startup info --- krakenparser/stats/diversity.py | 34 ++++++++++++++++++++++++++------- krakenparser/stats/relabund.py | 15 ++++++++++----- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/krakenparser/stats/diversity.py b/krakenparser/stats/diversity.py index ad49651..dc39eb4 100644 --- a/krakenparser/stats/diversity.py +++ b/krakenparser/stats/diversity.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import argparse +import logging import sys from pathlib import Path @@ -8,6 +9,10 @@ import pandas as pd from scipy.spatial.distance import pdist, squareform +from krakenparser.utils import ensure_output_dir + +_log = logging.getLogger(__name__) + def shannon_index(counts): counts = np.array(counts) @@ -44,6 +49,7 @@ def _subsample_counts( def calc_alpha_div(df, output_path): + out_path = ensure_output_dir(output_path, is_file=False) results = [] for sample_id, row in df.iterrows(): counts = row.values @@ -56,10 +62,15 @@ def calc_alpha_div(df, output_path): } ) alpha_df = pd.DataFrame(results).set_index("Sample") - alpha_df.to_csv(output_path / "alpha_div.csv") + alpha_df.to_csv(out_path / "alpha_div.csv") + + _log.info( + f"α-diversity has been successfully calculated and saved to '{output_path}'." + ) def calc_beta_div(df, output_path, rarefaction_depth, seed=None): + out_path = ensure_output_dir(output_path, is_file=False) rng = np.random.default_rng(seed) rarefied_counts: list[np.ndarray] = [] sample_ids: list[str] = [] @@ -88,11 +99,16 @@ def calc_beta_div(df, output_path, rarefaction_depth, seed=None): columns=idx, ) - bray_df.to_csv(output_path / "beta_div_bray.csv") - jaccard_df.to_csv(output_path / "beta_div_jaccard.csv") + bray_df.to_csv(out_path / "beta_div_bray.csv") + jaccard_df.to_csv(out_path / "beta_div_jaccard.csv") + _log.info( + f"β-diversity has been successfully calculated and saved to '{output_path}'." + ) -if __name__ == "__main__": + +def main() -> None: + logging.basicConfig(level=logging.INFO, format="%(message)s") parser = argparse.ArgumentParser(description="Calculate α & β-diversities.") parser.add_argument( "-i", @@ -117,6 +133,9 @@ def calc_beta_div(df, output_path, rarefaction_depth, seed=None): ) args = parser.parse_args() + seed_label = str(args.seed) if args.seed is not None else "not set (results will vary between runs)" + _log.info("Rarefaction depth: %d | seed: %s", args.depth, seed_label) + input_file = Path(args.input) if not input_file.is_file(): sys.exit(f"Error: input file not found: {input_file}") @@ -127,6 +146,7 @@ def calc_beta_div(df, output_path, rarefaction_depth, seed=None): calc_alpha_div(df, output_dir) calc_beta_div(df, output_dir, args.depth, seed=args.seed) - print( - f"α & β-diversities have been successfully calculated and saved to '{output_dir}'." - ) + + +if __name__ == "__main__": + main() diff --git a/krakenparser/stats/relabund.py b/krakenparser/stats/relabund.py index 5b1b0c5..0957329 100644 --- a/krakenparser/stats/relabund.py +++ b/krakenparser/stats/relabund.py @@ -7,6 +7,8 @@ import pandas as pd +from krakenparser.utils import ensure_output_dir + _log = logging.getLogger(__name__) @@ -14,9 +16,7 @@ def calculate_rel_abund(input_file, output_file, other_threshold=None): in_path = Path(input_file) if not in_path.is_file(): raise FileNotFoundError(f"Input file not found: {in_path}") - out_path = Path(output_file) - if not out_path.parent.exists(): - raise FileNotFoundError(f"Output directory does not exist: {out_path.parent}") + out_path = ensure_output_dir(output_file, is_file=True) # Load counts table df = pd.read_csv(in_path) @@ -59,11 +59,12 @@ def calculate_rel_abund(input_file, output_file, other_threshold=None): ) # Save to CSV - result.to_csv(output_file, index=False) + result.to_csv(out_path, index=False) _log.info("Relative abundance saved as '%s'.", output_file) -if __name__ == "__main__": +def main() -> None: + logging.basicConfig(level=logging.INFO, format="%(message)s") parser = argparse.ArgumentParser( description="Calculates taxa relative abundance and saves it to a CSV file." ) @@ -81,3 +82,7 @@ def calculate_rel_abund(input_file, output_file, other_threshold=None): args = parser.parse_args() calculate_rel_abund(args.input, args.output, args.other) + + +if __name__ == "__main__": + main() From 17393de5d31573befc477ae0e4c4400a8664db03 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Fri, 22 May 2026 15:12:27 +0200 Subject: [PATCH 10/17] refactor(pipeline): raise exceptions instead of sys.exit in run_pipeline --- krakenparser/pipeline.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/krakenparser/pipeline.py b/krakenparser/pipeline.py index 768b5d8..4f51e10 100644 --- a/krakenparser/pipeline.py +++ b/krakenparser/pipeline.py @@ -47,7 +47,7 @@ def run_pipeline( ) -> None: source_dir = Path(input_dir) if not source_dir.is_dir(): - sys.exit(f"Error: input directory not found: {source_dir}") + raise FileNotFoundError(f"Input directory not found: {source_dir}") out_dir = Path(output_dir) if output_dir else source_dir.parent out_dir.mkdir(parents=True, exist_ok=True) @@ -55,8 +55,8 @@ def run_pipeline( existing = [out_dir / d for d in _OUTPUT_SUBDIRS if (out_dir / d).exists()] if existing and not overwrite: names = ", ".join(d.name for d in existing) - sys.exit( - f"Error: output already exists in '{out_dir}' ({names}).\n" + raise FileExistsError( + f"Output already exists in '{out_dir}' ({names}).\n" "Use --overwrite to overwrite it." ) if overwrite: @@ -164,14 +164,17 @@ def main() -> None: help="Overwrite the output directory if it already exists", ) args = parser.parse_args() - run_pipeline( - args.input, - args.output, - keep_human=args.keep_human, - rarefaction_depth=args.depth, - seed=args.seed, - overwrite=args.overwrite, - ) + try: + run_pipeline( + args.input, + args.output, + keep_human=args.keep_human, + rarefaction_depth=args.depth, + seed=args.seed, + overwrite=args.overwrite, + ) + except (FileNotFoundError, FileExistsError) as e: + sys.exit(f"Error: {e}") if __name__ == "__main__": From 2f51ffbb1fff8612eac4d80ac62a2745115d8996 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Fri, 22 May 2026 15:12:40 +0200 Subject: [PATCH 11/17] test: add ensure_output_dir unit tests --- tests/test_units.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/test_units.py b/tests/test_units.py index 39435d3..4f977e1 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -1,12 +1,14 @@ """Pure-function unit tests — no I/O, fully deterministic.""" import math +from pathlib import Path import pytest from krakenparser.counts.processing_script import modify_taxa_names from krakenparser.mpa.transform2mpa import _parse_line from krakenparser.stats.diversity import chao1_index, pielou_evenness, shannon_index +from krakenparser.utils import ensure_output_dir # --------------------------------------------------------------------------- # _parse_line @@ -147,3 +149,37 @@ def test_modify_taxa_names_count_fields_not_modified(): # Underscores in tab-separated count fields must be preserved result = modify_taxa_names("s__My_taxon\t1_000\t2_000") assert result == "My taxon\t1_000\t2_000" + + +# --------------------------------------------------------------------------- +# ensure_output_dir +# --------------------------------------------------------------------------- + + +def test_ensure_output_dir_file_creates_parent(tmp_path): + p = ensure_output_dir(tmp_path / "subdir" / "output.csv", is_file=True) + assert (tmp_path / "subdir").is_dir() + assert not p.exists() # only the parent is created, not the file itself + + +def test_ensure_output_dir_dir_creates_directory(tmp_path): + p = ensure_output_dir(tmp_path / "output_dir", is_file=False) + assert p.is_dir() + + +def test_ensure_output_dir_nested_creates_all_parents(tmp_path): + p = ensure_output_dir(tmp_path / "a" / "b" / "c", is_file=False) + assert p.is_dir() + + +def test_ensure_output_dir_returns_path_object(tmp_path): + p = ensure_output_dir(str(tmp_path / "out.csv"), is_file=True) + assert isinstance(p, Path) + + +def test_ensure_output_dir_idempotent_for_existing_dir(tmp_path): + existing = tmp_path / "already_exists" + existing.mkdir() + p = ensure_output_dir(existing, is_file=False) + assert p == existing + assert p.is_dir() From 758b07ecc4b41b8bd4b89020a2161fb6f46fd24b Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Fri, 22 May 2026 15:12:52 +0200 Subject: [PATCH 12/17] test: cover auto-create dirs, new validations, new exception types --- tests/test_integration.py | 80 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/tests/test_integration.py b/tests/test_integration.py index 35913d3..18d747f 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -9,6 +9,7 @@ from krakenparser.counts.convert2csv import convert_to_csv from krakenparser.counts.processing_script import process_files from krakenparser.counts.split_mpa import split_mpa +from krakenparser.mpa.mpa_table import combine_mpa from krakenparser.mpa.transform2mpa import kreport_to_mpa from krakenparser.stats.diversity import calc_alpha_div, calc_beta_div from krakenparser.stats.relabund import calculate_rel_abund @@ -361,3 +362,82 @@ def test_split_mpa_genus_excludes_species_lines(combined_mpa_file, tmp_path): def test_split_mpa_missing_input_raises(tmp_path): with pytest.raises(FileNotFoundError): split_mpa(str(tmp_path / "ghost.txt"), str(tmp_path / "out")) + + +# --------------------------------------------------------------------------- +# auto-create output directories (ensure_output_dir behaviour) +# --------------------------------------------------------------------------- + + +def test_kreport_to_mpa_creates_output_dir(kreport_file, tmp_path): + out = tmp_path / "new_subdir" / "out.MPA.TXT" + kreport_to_mpa(str(kreport_file), str(out)) + assert out.exists() + + +def test_kreport_to_mpa_missing_input_raises(tmp_path): + with pytest.raises(FileNotFoundError): + kreport_to_mpa(str(tmp_path / "ghost.kreport"), str(tmp_path / "out.MPA.TXT")) + + +def test_convert_to_csv_creates_output_dir(counts_txt_file, tmp_path): + out = tmp_path / "new_subdir" / "counts.csv" + convert_to_csv(str(counts_txt_file), str(out)) + assert out.exists() + + +def test_relabund_creates_output_dir(counts_csv_file, tmp_path): + out = tmp_path / "new_subdir" / "ra.csv" + calculate_rel_abund(str(counts_csv_file), str(out)) + assert out.exists() + + +def test_alpha_div_creates_output_dir(counts_csv_file, tmp_path): + df = pd.read_csv(counts_csv_file, index_col=0) + out_dir = tmp_path / "new_dir" / "nested" + calc_alpha_div(df, out_dir) + assert (out_dir / "alpha_div.csv").exists() + + +def test_beta_div_creates_output_dir(counts_csv_file, tmp_path): + df = pd.read_csv(counts_csv_file, index_col=0) + out_dir = tmp_path / "new_dir" / "nested" + calc_beta_div(df, out_dir, rarefaction_depth=1000, seed=42) + assert (out_dir / "beta_div_bray.csv").exists() + + +# --------------------------------------------------------------------------- +# combine_mpa — new input validation +# --------------------------------------------------------------------------- + +SAMPLE_MPA_A = "#Classification\tsample1\nd__Bacteria|s__Pseudomonas_aeruginosa\t300\n" +SAMPLE_MPA_B = "#Classification\tsample2\nd__Bacteria|s__Pseudomonas_aeruginosa\t100\n" + + +def test_combine_mpa_creates_output_dir(tmp_path): + a = tmp_path / "a.MPA.TXT" + b = tmp_path / "b.MPA.TXT" + a.write_text(SAMPLE_MPA_A) + b.write_text(SAMPLE_MPA_B) + out = tmp_path / "new_subdir" / "COMBINED.txt" + combine_mpa([str(a), str(b)], str(out)) + assert out.exists() + + +def test_combine_mpa_missing_input_raises(tmp_path): + existing = tmp_path / "a.MPA.TXT" + existing.write_text(SAMPLE_MPA_A) + with pytest.raises(FileNotFoundError): + combine_mpa([str(existing), str(tmp_path / "ghost.MPA.TXT")], str(tmp_path / "out.txt")) + + +# --------------------------------------------------------------------------- +# process_files — destination must already exist (in-place modifier) +# --------------------------------------------------------------------------- + + +def test_process_files_missing_dest_still_raises(tmp_path): + source = tmp_path / "COMBINED.txt" + source.write_text("#Classification\tsample1.kreport\n") + with pytest.raises(FileNotFoundError): + process_files(str(source), str(tmp_path / "nonexistent.txt")) From 87b9da04b319f6d8509d3af7ecbcfc745327fc4f Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Fri, 22 May 2026 15:13:06 +0200 Subject: [PATCH 13/17] =?UTF-8?q?test(pipeline):=20fix=20overwrite=20test?= =?UTF-8?q?=20=E2=80=94=20FileExistsError,=20not=20SystemExit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_full_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_full_pipeline.py b/tests/test_full_pipeline.py index efa6854..72df80f 100644 --- a/tests/test_full_pipeline.py +++ b/tests/test_full_pipeline.py @@ -61,8 +61,8 @@ def test_pipeline_overwrite_protection(demo_run): run_pipeline(str(kreports_path)) - # Second run without --overwrite must exit - with pytest.raises(SystemExit): + # Second run without --overwrite must raise (library function, not sys.exit) + with pytest.raises(FileExistsError): run_pipeline(str(kreports_path)) From fff6234b4e9eeeeb4027a678d4176e0d33a7d070 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Fri, 22 May 2026 15:24:09 +0200 Subject: [PATCH 14/17] test: add CLI smoke tests for all main() entry points --- tests/test_cli.py | 199 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 tests/test_cli.py diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..27eadd5 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,199 @@ +"""Smoke tests for CLI entry-points (main() functions via sys.argv monkeypatching).""" + +import shutil +import sys +import warnings + +import pandas as pd +import pytest + +from krakenparser.counts.convert2csv import main as convert2csv_main +from krakenparser.counts.processing_script import main as processing_main +from krakenparser.counts.split_mpa import main as split_mpa_main +from krakenparser.mpa.mpa_table import main as mpa_table_main +from krakenparser.mpa.transform2mpa import main as transform2mpa_main +from krakenparser.pipeline import main as pipeline_main +from krakenparser.stats.diversity import main as diversity_main +from krakenparser.stats.relabund import main as relabund_main + +_MPA_A = "#Classification\tsample1\nd__Bacteria|s__Pseudomonas_aeruginosa\t300\n" +_MPA_B = "#Classification\tsample2\nd__Bacteria|s__Pseudomonas_aeruginosa\t100\n" + +_COMBINED_MPA = ( + "#Classification\tsample1\tsample2\n" + "d__Bacteria|p__Pseudomonadota|g__Pseudomonas|s__Pseudomonas_aeruginosa\t300\t100\n" + "d__Bacteria|p__Bacteroidota\t100\t80\n" +) + + +# --------------------------------------------------------------------------- +# convert2csv +# --------------------------------------------------------------------------- + + +def test_convert2csv_main(counts_txt_file, tmp_path, monkeypatch): + out = tmp_path / "out.csv" + monkeypatch.setattr(sys, "argv", ["c2c", "-i", str(counts_txt_file), "-o", str(out)]) + convert2csv_main() + assert out.exists() + + +# --------------------------------------------------------------------------- +# processing_script +# --------------------------------------------------------------------------- + + +def test_processing_main(tmp_path, monkeypatch): + source = tmp_path / "COMBINED.txt" + source.write_text("#Classification\tsample1.kreport\n") + dest = tmp_path / "counts.txt" + dest.write_text("s__Pseudomonas_aeruginosa\t100\n") + monkeypatch.setattr(sys, "argv", ["ps", "-i", str(source), "-o", str(dest)]) + processing_main() + + +# --------------------------------------------------------------------------- +# split_mpa +# --------------------------------------------------------------------------- + + +def test_split_mpa_main(tmp_path, monkeypatch): + combined = tmp_path / "COMBINED.txt" + combined.write_text(_COMBINED_MPA) + out = tmp_path / "out" + monkeypatch.setattr(sys, "argv", ["sm", "-i", str(combined), "-o", str(out)]) + split_mpa_main() + assert (out / "txt" / "counts_species.txt").exists() + + +def test_split_mpa_main_viruses_only(tmp_path, monkeypatch): + combined = tmp_path / "COMBINED.txt" + combined.write_text(_COMBINED_MPA + "d__Viruses|s__Virus_X\t5\t3\n") + out = tmp_path / "out" + monkeypatch.setattr( + sys, "argv", ["sm", "-i", str(combined), "-o", str(out), "--viruses-only"] + ) + split_mpa_main() + + +def test_split_mpa_main_keep_human(tmp_path, monkeypatch): + combined = tmp_path / "COMBINED.txt" + combined.write_text(_COMBINED_MPA) + out = tmp_path / "out" + monkeypatch.setattr( + sys, "argv", ["sm", "-i", str(combined), "-o", str(out), "--keep-human"] + ) + split_mpa_main() + + +# --------------------------------------------------------------------------- +# mpa_table +# --------------------------------------------------------------------------- + + +def test_mpa_table_main(tmp_path, monkeypatch): + a, b = tmp_path / "a.MPA.TXT", tmp_path / "b.MPA.TXT" + a.write_text(_MPA_A) + b.write_text(_MPA_B) + out = tmp_path / "COMBINED.txt" + monkeypatch.setattr( + sys, "argv", ["mt", "-i", str(a), str(b), "-o", str(out)] + ) + mpa_table_main() + assert out.exists() + + +# --------------------------------------------------------------------------- +# transform2mpa +# --------------------------------------------------------------------------- + + +def test_transform2mpa_main_single(kreport_file, tmp_path, monkeypatch): + out = tmp_path / "out.MPA.TXT" + monkeypatch.setattr( + sys, "argv", ["t2m", "-r", str(kreport_file), "-o", str(out)] + ) + transform2mpa_main() + assert out.exists() + + +def test_transform2mpa_main_batch(kreport_file, tmp_path, monkeypatch): + kreports_dir = tmp_path / "kreports" + kreports_dir.mkdir() + shutil.copy(kreport_file, kreports_dir / kreport_file.name) + out_dir = tmp_path / "mpa_out" + monkeypatch.setattr( + sys, "argv", ["t2m", "-i", str(kreports_dir), "-o", str(out_dir)] + ) + transform2mpa_main() + assert out_dir.is_dir() + + +# --------------------------------------------------------------------------- +# diversity +# --------------------------------------------------------------------------- + + +def test_diversity_main_with_seed(counts_csv_file, tmp_path, monkeypatch): + out_dir = tmp_path / "div" + monkeypatch.setattr( + sys, + "argv", + ["div", "-i", str(counts_csv_file), "-o", str(out_dir), "-d", "1000", "-s", "42"], + ) + diversity_main() + assert (out_dir / "alpha_div.csv").exists() + + +def test_diversity_main_no_seed(counts_csv_file, tmp_path, monkeypatch): + out_dir = tmp_path / "div" + monkeypatch.setattr( + sys, "argv", ["div", "-i", str(counts_csv_file), "-o", str(out_dir), "-d", "1000"] + ) + diversity_main() + + +# --------------------------------------------------------------------------- +# relabund +# --------------------------------------------------------------------------- + + +def test_relabund_main(counts_csv_file, tmp_path, monkeypatch): + out = tmp_path / "ra.csv" + monkeypatch.setattr(sys, "argv", ["ra", "-i", str(counts_csv_file), "-o", str(out)]) + relabund_main() + assert out.exists() + + +def test_relabund_main_with_other_threshold(counts_csv_file, tmp_path, monkeypatch): + out = tmp_path / "ra.csv" + monkeypatch.setattr( + sys, "argv", ["ra", "-i", str(counts_csv_file), "-o", str(out), "-O", "50"] + ) + relabund_main() + + +def test_relabund_warns_zero_abundance_sample(tmp_path): + df = pd.DataFrame( + {"Sample_id": ["S1", "S2"], "Taxon_A": [0, 100], "Taxon_B": [0, 200]} + ) + csv_in = tmp_path / "counts.csv" + df.to_csv(csv_in, index=False) + out = tmp_path / "ra.csv" + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + from krakenparser.stats.relabund import calculate_rel_abund + + calculate_rel_abund(str(csv_in), str(out)) + assert any("zero total abundance" in str(w.message) for w in caught) + + +# --------------------------------------------------------------------------- +# pipeline (error paths only — success path covered by test_full_pipeline.py) +# --------------------------------------------------------------------------- + + +def test_pipeline_main_missing_input_exits(tmp_path, monkeypatch): + monkeypatch.setattr(sys, "argv", ["pipeline", "-i", str(tmp_path / "ghost")]) + with pytest.raises(SystemExit): + pipeline_main() From da16a3bcf2ffd06459225401e0ec45c045289613 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Fri, 22 May 2026 15:24:30 +0200 Subject: [PATCH 15/17] =?UTF-8?q?test:=20cover=20edge=20cases=20=E2=80=94?= =?UTF-8?q?=20=5Fstrip=5Fpath=5Fprefix,=20=5Fis=5Fprocessable,=20t=5F=5F?= =?UTF-8?q?=20filter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_integration.py | 47 +++++++++++++++++++++++++++++++++++++++ tests/test_units.py | 14 ++++++++++++ 2 files changed, 61 insertions(+) diff --git a/tests/test_integration.py b/tests/test_integration.py index 18d747f..5ea580f 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -11,6 +11,7 @@ from krakenparser.counts.split_mpa import split_mpa from krakenparser.mpa.mpa_table import combine_mpa from krakenparser.mpa.transform2mpa import kreport_to_mpa +from krakenparser.pipeline import _is_processable from krakenparser.stats.diversity import calc_alpha_div, calc_beta_div from krakenparser.stats.relabund import calculate_rel_abund @@ -441,3 +442,49 @@ def test_process_files_missing_dest_still_raises(tmp_path): source.write_text("#Classification\tsample1.kreport\n") with pytest.raises(FileNotFoundError): process_files(str(source), str(tmp_path / "nonexistent.txt")) + + +# --------------------------------------------------------------------------- +# split_mpa — t__ rank filter (intermediate terminal nodes) +# --------------------------------------------------------------------------- + + +def test_split_mpa_filters_terminal_rank_nodes(tmp_path): + combined = tmp_path / "COMBINED.txt" + combined.write_text( + "#Classification\tsample1\n" + "d__Bacteria|p__Pseudomonadota|s__Pseudomonas_aeruginosa\t300\n" + "d__Bacteria|p__Pseudomonadota|s__Pseudomonas_aeruginosa|t__strain_X\t10\n" + ) + split_mpa(str(combined), str(tmp_path / "out")) + species = (tmp_path / "out" / "txt" / "counts_species.txt").read_text() + assert "t__" not in species + + +# --------------------------------------------------------------------------- +# _is_processable — hidden files, null bytes, non-UTF-8 +# --------------------------------------------------------------------------- + + +def test_is_processable_hidden_file(tmp_path): + f = tmp_path / ".hidden" + f.write_text("content") + assert not _is_processable(f) + + +def test_is_processable_null_bytes(tmp_path): + f = tmp_path / "binary.bin" + f.write_bytes(b"hello\x00world") + assert not _is_processable(f) + + +def test_is_processable_non_utf8(tmp_path): + f = tmp_path / "latin1.txt" + f.write_bytes(b"\xff\xfe bad encoding") + assert not _is_processable(f) + + +def test_is_processable_valid_kreport(tmp_path): + f = tmp_path / "sample.kreport" + f.write_text("50.0\t500\t100\tS\t1\tBacteria\n") + assert _is_processable(f) diff --git a/tests/test_units.py b/tests/test_units.py index 4f977e1..ee96f56 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -6,6 +6,7 @@ import pytest from krakenparser.counts.processing_script import modify_taxa_names +from krakenparser.counts.split_mpa import _strip_path_prefix from krakenparser.mpa.transform2mpa import _parse_line from krakenparser.stats.diversity import chao1_index, pielou_evenness, shannon_index from krakenparser.utils import ensure_output_dir @@ -151,6 +152,19 @@ def test_modify_taxa_names_count_fields_not_modified(): assert result == "My taxon\t1_000\t2_000" +# --------------------------------------------------------------------------- +# _strip_path_prefix +# --------------------------------------------------------------------------- + + +def test_strip_path_prefix_tab_less_line(): + assert _strip_path_prefix("no_tab_here") == "no_tab_here" + + +def test_strip_path_prefix_normal(): + assert _strip_path_prefix("d__Bacteria|s__E_coli\t100\t200") == "s__E_coli\t100\t200" + + # --------------------------------------------------------------------------- # ensure_output_dir # --------------------------------------------------------------------------- From 672e9532573ed81fb4f781c0c88783c20ff14532 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Fri, 22 May 2026 15:24:41 +0200 Subject: [PATCH 16/17] fix(codecov): correct status schema and move patch to informational --- codecov.yml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/codecov.yml b/codecov.yml index 4200a1c..433678e 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1,4 +1,9 @@ coverage: - patch: - target: 78% - informational: true \ No newline at end of file + status: + project: + default: + target: auto + threshold: 1% + patch: + default: + informational: true From ae1a78cd1db491fbdf014896ecae2524e7a1d20b Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Fri, 22 May 2026 15:27:42 +0200 Subject: [PATCH 17/17] style(scripts): apply Ruff formatter --- krakenparser/stats/diversity.py | 6 +++++- tests/test_cli.py | 28 +++++++++++++++++++--------- tests/test_integration.py | 4 +++- tests/test_units.py | 4 +++- 4 files changed, 30 insertions(+), 12 deletions(-) diff --git a/krakenparser/stats/diversity.py b/krakenparser/stats/diversity.py index dc39eb4..4977f55 100644 --- a/krakenparser/stats/diversity.py +++ b/krakenparser/stats/diversity.py @@ -133,7 +133,11 @@ def main() -> None: ) args = parser.parse_args() - seed_label = str(args.seed) if args.seed is not None else "not set (results will vary between runs)" + seed_label = ( + str(args.seed) + if args.seed is not None + else "not set (results will vary between runs)" + ) _log.info("Rarefaction depth: %d | seed: %s", args.depth, seed_label) input_file = Path(args.input) diff --git a/tests/test_cli.py b/tests/test_cli.py index 27eadd5..9badb19 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -33,7 +33,9 @@ def test_convert2csv_main(counts_txt_file, tmp_path, monkeypatch): out = tmp_path / "out.csv" - monkeypatch.setattr(sys, "argv", ["c2c", "-i", str(counts_txt_file), "-o", str(out)]) + monkeypatch.setattr( + sys, "argv", ["c2c", "-i", str(counts_txt_file), "-o", str(out)] + ) convert2csv_main() assert out.exists() @@ -96,9 +98,7 @@ def test_mpa_table_main(tmp_path, monkeypatch): a.write_text(_MPA_A) b.write_text(_MPA_B) out = tmp_path / "COMBINED.txt" - monkeypatch.setattr( - sys, "argv", ["mt", "-i", str(a), str(b), "-o", str(out)] - ) + monkeypatch.setattr(sys, "argv", ["mt", "-i", str(a), str(b), "-o", str(out)]) mpa_table_main() assert out.exists() @@ -110,9 +110,7 @@ def test_mpa_table_main(tmp_path, monkeypatch): def test_transform2mpa_main_single(kreport_file, tmp_path, monkeypatch): out = tmp_path / "out.MPA.TXT" - monkeypatch.setattr( - sys, "argv", ["t2m", "-r", str(kreport_file), "-o", str(out)] - ) + monkeypatch.setattr(sys, "argv", ["t2m", "-r", str(kreport_file), "-o", str(out)]) transform2mpa_main() assert out.exists() @@ -139,7 +137,17 @@ def test_diversity_main_with_seed(counts_csv_file, tmp_path, monkeypatch): monkeypatch.setattr( sys, "argv", - ["div", "-i", str(counts_csv_file), "-o", str(out_dir), "-d", "1000", "-s", "42"], + [ + "div", + "-i", + str(counts_csv_file), + "-o", + str(out_dir), + "-d", + "1000", + "-s", + "42", + ], ) diversity_main() assert (out_dir / "alpha_div.csv").exists() @@ -148,7 +156,9 @@ def test_diversity_main_with_seed(counts_csv_file, tmp_path, monkeypatch): def test_diversity_main_no_seed(counts_csv_file, tmp_path, monkeypatch): out_dir = tmp_path / "div" monkeypatch.setattr( - sys, "argv", ["div", "-i", str(counts_csv_file), "-o", str(out_dir), "-d", "1000"] + sys, + "argv", + ["div", "-i", str(counts_csv_file), "-o", str(out_dir), "-d", "1000"], ) diversity_main() diff --git a/tests/test_integration.py b/tests/test_integration.py index 5ea580f..3fe61e6 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -429,7 +429,9 @@ def test_combine_mpa_missing_input_raises(tmp_path): existing = tmp_path / "a.MPA.TXT" existing.write_text(SAMPLE_MPA_A) with pytest.raises(FileNotFoundError): - combine_mpa([str(existing), str(tmp_path / "ghost.MPA.TXT")], str(tmp_path / "out.txt")) + combine_mpa( + [str(existing), str(tmp_path / "ghost.MPA.TXT")], str(tmp_path / "out.txt") + ) # --------------------------------------------------------------------------- diff --git a/tests/test_units.py b/tests/test_units.py index ee96f56..414d87d 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -162,7 +162,9 @@ def test_strip_path_prefix_tab_less_line(): def test_strip_path_prefix_normal(): - assert _strip_path_prefix("d__Bacteria|s__E_coli\t100\t200") == "s__E_coli\t100\t200" + assert ( + _strip_path_prefix("d__Bacteria|s__E_coli\t100\t200") == "s__E_coli\t100\t200" + ) # ---------------------------------------------------------------------------