diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 553faa6..fe4fe56 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -27,14 +27,14 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install flake8 setuptools wheel + python -m pip install ruff setuptools wheel pip install -e ".[dev]" --no-build-isolation - - name: Lint with flake8 + - name: Lint with ruff run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + # Run linter and fail on any rule violation + ruff check . + # Check if the code format matches Ruff style guide + ruff format --check . - name: Test with pytest run: | pytest --cov=krakenparser --cov-report=xml diff --git a/codecov.yml b/codecov.yml index 4200a1c..433678e 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1,4 +1,9 @@ coverage: - patch: - target: 78% - informational: true \ No newline at end of file + status: + project: + default: + target: auto + threshold: 1% + patch: + default: + informational: true diff --git a/krakenparser/counts/convert2csv.py b/krakenparser/counts/convert2csv.py index 1177b91..9fb763e 100755 --- a/krakenparser/counts/convert2csv.py +++ b/krakenparser/counts/convert2csv.py @@ -6,6 +6,8 @@ import pandas as pd +from krakenparser.utils import ensure_output_dir + _log = logging.getLogger(__name__) @@ -13,17 +15,15 @@ def convert_to_csv(input_file, output_file): in_path = Path(input_file) if not in_path.is_file(): raise FileNotFoundError(f"Input file not found: {in_path}") - out_path = Path(output_file) - if not out_path.parent.exists(): - raise FileNotFoundError(f"Output directory does not exist: {out_path.parent}") + out_path = ensure_output_dir(output_file, is_file=True) data = pd.read_csv(in_path, sep="\t", index_col=0) data.T.to_csv(out_path, index_label="Sample_id") _log.info("Data converted and saved as '%s'.", output_file) -if __name__ == "__main__": - # Use argparse to handle command-line arguments +def main() -> None: + logging.basicConfig(level=logging.INFO, format="%(message)s") parser = argparse.ArgumentParser( description="Reads a TXT file, reorganizes the data, and converts it into a CSV file." ) @@ -39,8 +39,9 @@ def convert_to_csv(input_file, output_file): required=True, help="Path to the output CSV file. The script will restructure the data and save it here.", ) - args = parser.parse_args() - - # Call function with parsed arguments convert_to_csv(args.input, args.output) + + +if __name__ == "__main__": + main() diff --git a/krakenparser/counts/processing_script.py b/krakenparser/counts/processing_script.py index 831910e..0d6d59a 100755 --- a/krakenparser/counts/processing_script.py +++ b/krakenparser/counts/processing_script.py @@ -1,10 +1,13 @@ #!/usr/bin/env python import argparse +import logging import os import tempfile from pathlib import Path +_log = logging.getLogger(__name__) + def modify_taxa_names(line): prefixes = ["s__", "g__", "f__", "o__", "c__", "p__"] @@ -47,11 +50,11 @@ def process_files(source_file, destination_file): tmp_path = tmp.name os.replace(tmp_path, dest_path) - print(f"Processed {destination_file} successfully.") + _log.info(f"Processed {destination_file} successfully.") -if __name__ == "__main__": - # Use argparse to parse command-line arguments +def main() -> None: + logging.basicConfig(level=logging.INFO, format="%(message)s") parser = argparse.ArgumentParser( description="Reads a source file, processes its first line, modifies taxa names in a destination file, and updates it." ) @@ -67,8 +70,9 @@ def process_files(source_file, destination_file): required=True, help="Path to the destination file. This file's contents will be updated with cleaned taxa names.", ) - args = parser.parse_args() - - # Call the function with parsed arguments process_files(args.input, args.output) + + +if __name__ == "__main__": + main() diff --git a/krakenparser/counts/split_mpa.py b/krakenparser/counts/split_mpa.py index 0ee87b8..1af110e 100644 --- a/krakenparser/counts/split_mpa.py +++ b/krakenparser/counts/split_mpa.py @@ -9,6 +9,8 @@ import re from pathlib import Path +from krakenparser.utils import ensure_output_dir + _log = logging.getLogger(__name__) @@ -53,8 +55,8 @@ def split_mpa( in_path = Path(input_file) if not in_path.is_file(): raise FileNotFoundError(f"Input file not found: {in_path}") - out_path = Path(output_dir) - (out_path / "txt").mkdir(parents=True, exist_ok=True) + out_path = ensure_output_dir(output_dir, is_file=False) + (out_path / "txt").mkdir(exist_ok=True) lines = in_path.read_text().splitlines() data_lines = [ln for ln in lines if not ln.startswith("#") and ln.strip()] @@ -86,6 +88,7 @@ def split_mpa( def main() -> None: + logging.basicConfig(level=logging.INFO, format="%(message)s") parser = argparse.ArgumentParser( description="Split a combined MPA table into per-rank TXT files." ) diff --git a/krakenparser/kpplot/__init__.py b/krakenparser/kpplot/__init__.py index 147afc0..c353a3e 100644 --- a/krakenparser/kpplot/__init__.py +++ b/krakenparser/kpplot/__init__.py @@ -1 +1,3 @@ from .base import KpPlotBase + +__all__ = ["KpPlotBase"] diff --git a/krakenparser/mpa/mpa_table.py b/krakenparser/mpa/mpa_table.py index c808383..2837972 100644 --- a/krakenparser/mpa/mpa_table.py +++ b/krakenparser/mpa/mpa_table.py @@ -3,17 +3,25 @@ import argparse import logging +from pathlib import Path + +from krakenparser.utils import ensure_output_dir _log = logging.getLogger(__name__) def combine_mpa(in_files: list[str], o_file: str) -> None: + out_path = ensure_output_dir(o_file, is_file=True) # Plain dict preserves insertion order (Python 3.7+). taxa: dict[str, dict[int, str]] = {} sample_names: list[str] = [] _log.info("Number of files to parse: %d", len(in_files)) + for in_path in in_files: + if not Path(in_path).is_file(): + raise FileNotFoundError(f"Input file not found: {in_path}") + for idx, in_path in enumerate(in_files): sample_name = f"Sample #{idx + 1}" with open(in_path) as fh: @@ -39,7 +47,7 @@ def combine_mpa(in_files: list[str], o_file: str) -> None: n_taxa = len(taxa) _log.info("Number of classifications to write: %d", n_taxa) - with open(o_file, "w") as fh: + with open(out_path, "w") as fh: fh.write("#Classification\t" + "\t".join(sample_names) + "\n") for taxon, counts in taxa.items(): row = [counts.get(i, "0") for i in range(n_samples)] @@ -49,6 +57,7 @@ def combine_mpa(in_files: list[str], o_file: str) -> None: def main() -> None: + logging.basicConfig(level=logging.INFO, format="%(message)s") parser = argparse.ArgumentParser( description="Combine MPA files into a single tab-delimited table." ) diff --git a/krakenparser/mpa/transform2mpa.py b/krakenparser/mpa/transform2mpa.py index a28cde6..8d362f7 100644 --- a/krakenparser/mpa/transform2mpa.py +++ b/krakenparser/mpa/transform2mpa.py @@ -2,10 +2,13 @@ """Convert a Kraken2 report to MetaPhlAn (MPA) format.""" import argparse +import logging import os import sys from pathlib import Path +from krakenparser.utils import ensure_output_dir + # Maps Kraken2 single-letter rank codes to MPA prefixes _RANK_PREFIX = { "D": "d", @@ -18,6 +21,8 @@ "S": "s", } +_log = logging.getLogger(__name__) + def _parse_line(line: str): """ @@ -67,10 +72,13 @@ def kreport_to_mpa( depth d is encountered, all stack entries with depth >= d are popped before the new entry is pushed, keeping the path consistent. """ + if not Path(report_path).is_file(): + raise FileNotFoundError(f"Input file not found: {report_path}") + out_path = ensure_output_dir(output_path, is_file=True) # Stack entries: (structural_depth, mpa_segment, is_standard_rank) stack: list[tuple[int, str, bool]] = [] - with open(report_path) as r_fh, open(output_path, "w") as o_fh: + with open(report_path) as r_fh, open(out_path, "w") as o_fh: if display_header: o_fh.write("#Classification\t" + os.path.basename(report_path) + "\n") @@ -110,6 +118,7 @@ def kreport_to_mpa( def main() -> None: + logging.basicConfig(level=logging.INFO, format="%(message)s") parser = argparse.ArgumentParser( description="Convert a Kraken2 report to MetaPhlAn (MPA) format." ) @@ -203,7 +212,7 @@ def main() -> None: continue out_name = f.name.replace(".kreport", ".MPA.TXT") kreport_to_mpa(str(f), str(output_dir / out_name), **kwargs) - print(f"Converted to MPA successfully. Output stored in {output_dir}") + _log.info(f"Converted to MPA successfully. Output stored in {output_dir}") else: kreport_to_mpa(args.r_file, args.o_file, **kwargs) diff --git a/krakenparser/pipeline.py b/krakenparser/pipeline.py index c54b4e9..4f51e10 100644 --- a/krakenparser/pipeline.py +++ b/krakenparser/pipeline.py @@ -7,8 +7,6 @@ import sys from pathlib import Path -_log = logging.getLogger(__name__) - import pandas as pd from krakenparser.counts.convert2csv import convert_to_csv @@ -19,6 +17,8 @@ from krakenparser.stats.diversity import calc_alpha_div, calc_beta_div from krakenparser.stats.relabund import calculate_rel_abund +_log = logging.getLogger(__name__) + def _is_processable(path: Path) -> bool: """Return False for hidden files, files with null bytes, or non-UTF-8 files.""" @@ -47,7 +47,7 @@ def run_pipeline( ) -> None: source_dir = Path(input_dir) if not source_dir.is_dir(): - sys.exit(f"Error: input directory not found: {source_dir}") + raise FileNotFoundError(f"Input directory not found: {source_dir}") out_dir = Path(output_dir) if output_dir else source_dir.parent out_dir.mkdir(parents=True, exist_ok=True) @@ -55,8 +55,8 @@ def run_pipeline( existing = [out_dir / d for d in _OUTPUT_SUBDIRS if (out_dir / d).exists()] if existing and not overwrite: names = ", ".join(d.name for d in existing) - sys.exit( - f"Error: output already exists in '{out_dir}' ({names}).\n" + raise FileExistsError( + f"Output already exists in '{out_dir}' ({names}).\n" "Use --overwrite to overwrite it." ) if overwrite: @@ -164,14 +164,17 @@ def main() -> None: help="Overwrite the output directory if it already exists", ) args = parser.parse_args() - run_pipeline( - args.input, - args.output, - keep_human=args.keep_human, - rarefaction_depth=args.depth, - seed=args.seed, - overwrite=args.overwrite, - ) + try: + run_pipeline( + args.input, + args.output, + keep_human=args.keep_human, + rarefaction_depth=args.depth, + seed=args.seed, + overwrite=args.overwrite, + ) + except (FileNotFoundError, FileExistsError) as e: + sys.exit(f"Error: {e}") if __name__ == "__main__": diff --git a/krakenparser/stats/diversity.py b/krakenparser/stats/diversity.py index dd76098..4977f55 100644 --- a/krakenparser/stats/diversity.py +++ b/krakenparser/stats/diversity.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import argparse +import logging import sys from pathlib import Path @@ -8,6 +9,10 @@ import pandas as pd from scipy.spatial.distance import pdist, squareform +from krakenparser.utils import ensure_output_dir + +_log = logging.getLogger(__name__) + def shannon_index(counts): counts = np.array(counts) @@ -44,6 +49,7 @@ def _subsample_counts( def calc_alpha_div(df, output_path): + out_path = ensure_output_dir(output_path, is_file=False) results = [] for sample_id, row in df.iterrows(): counts = row.values @@ -56,42 +62,53 @@ def calc_alpha_div(df, output_path): } ) alpha_df = pd.DataFrame(results).set_index("Sample") - alpha_df.to_csv(output_path / "alpha_div.csv") + alpha_df.to_csv(out_path / "alpha_div.csv") + + _log.info( + f"α-diversity has been successfully calculated and saved to '{output_path}'." + ) def calc_beta_div(df, output_path, rarefaction_depth, seed=None): + out_path = ensure_output_dir(output_path, is_file=False) rng = np.random.default_rng(seed) - rarefied_counts = [] - sample_ids = [] + rarefied_counts: list[np.ndarray] = [] + sample_ids: list[str] = [] for sample, row in df.iterrows(): counts = np.round(row.values).astype(int) if counts.sum() >= rarefaction_depth: rarefied = _subsample_counts(counts, n=rarefaction_depth, rng=rng) rarefied_counts.append(rarefied) - sample_ids.append(sample) + sample_ids.append(str(sample)) if len(rarefied_counts) < 2: raise ValueError("Not enough samples passed the rarefaction threshold.") X = np.array(rarefied_counts, dtype=float) + idx = pd.Index(sample_ids) bray_df = pd.DataFrame( squareform(pdist(X, metric="braycurtis")), - index=sample_ids, - columns=sample_ids, + index=idx, + columns=idx, ) jaccard_df = pd.DataFrame( squareform(pdist(X.astype(bool).astype(float), metric="jaccard")), - index=sample_ids, - columns=sample_ids, + index=idx, + columns=idx, ) - bray_df.to_csv(output_path / "beta_div_bray.csv") - jaccard_df.to_csv(output_path / "beta_div_jaccard.csv") + bray_df.to_csv(out_path / "beta_div_bray.csv") + jaccard_df.to_csv(out_path / "beta_div_jaccard.csv") + _log.info( + f"β-diversity has been successfully calculated and saved to '{output_path}'." + ) -if __name__ == "__main__": + +def main() -> None: + logging.basicConfig(level=logging.INFO, format="%(message)s") parser = argparse.ArgumentParser(description="Calculate α & β-diversities.") parser.add_argument( "-i", @@ -116,6 +133,13 @@ def calc_beta_div(df, output_path, rarefaction_depth, seed=None): ) args = parser.parse_args() + seed_label = ( + str(args.seed) + if args.seed is not None + else "not set (results will vary between runs)" + ) + _log.info("Rarefaction depth: %d | seed: %s", args.depth, seed_label) + input_file = Path(args.input) if not input_file.is_file(): sys.exit(f"Error: input file not found: {input_file}") @@ -126,6 +150,7 @@ def calc_beta_div(df, output_path, rarefaction_depth, seed=None): calc_alpha_div(df, output_dir) calc_beta_div(df, output_dir, args.depth, seed=args.seed) - print( - f"α & β-diversities have been successfully calculated and saved to '{output_dir}'." - ) + + +if __name__ == "__main__": + main() diff --git a/krakenparser/stats/relabund.py b/krakenparser/stats/relabund.py index 5b1b0c5..0957329 100644 --- a/krakenparser/stats/relabund.py +++ b/krakenparser/stats/relabund.py @@ -7,6 +7,8 @@ import pandas as pd +from krakenparser.utils import ensure_output_dir + _log = logging.getLogger(__name__) @@ -14,9 +16,7 @@ def calculate_rel_abund(input_file, output_file, other_threshold=None): in_path = Path(input_file) if not in_path.is_file(): raise FileNotFoundError(f"Input file not found: {in_path}") - out_path = Path(output_file) - if not out_path.parent.exists(): - raise FileNotFoundError(f"Output directory does not exist: {out_path.parent}") + out_path = ensure_output_dir(output_file, is_file=True) # Load counts table df = pd.read_csv(in_path) @@ -59,11 +59,12 @@ def calculate_rel_abund(input_file, output_file, other_threshold=None): ) # Save to CSV - result.to_csv(output_file, index=False) + result.to_csv(out_path, index=False) _log.info("Relative abundance saved as '%s'.", output_file) -if __name__ == "__main__": +def main() -> None: + logging.basicConfig(level=logging.INFO, format="%(message)s") parser = argparse.ArgumentParser( description="Calculates taxa relative abundance and saves it to a CSV file." ) @@ -81,3 +82,7 @@ def calculate_rel_abund(input_file, output_file, other_threshold=None): args = parser.parse_args() calculate_rel_abund(args.input, args.output, args.other) + + +if __name__ == "__main__": + main() diff --git a/krakenparser/utils.py b/krakenparser/utils.py new file mode 100644 index 0000000..7f9a780 --- /dev/null +++ b/krakenparser/utils.py @@ -0,0 +1,10 @@ +# krakenparser/utils.py +from pathlib import Path + + +def ensure_output_dir(path: str | Path, is_file: bool = True) -> Path: + """Create parent directory for a file output, or the directory itself.""" + p = Path(path) + target = p.parent if is_file else p + target.mkdir(parents=True, exist_ok=True) + return p diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..9badb19 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,209 @@ +"""Smoke tests for CLI entry-points (main() functions via sys.argv monkeypatching).""" + +import shutil +import sys +import warnings + +import pandas as pd +import pytest + +from krakenparser.counts.convert2csv import main as convert2csv_main +from krakenparser.counts.processing_script import main as processing_main +from krakenparser.counts.split_mpa import main as split_mpa_main +from krakenparser.mpa.mpa_table import main as mpa_table_main +from krakenparser.mpa.transform2mpa import main as transform2mpa_main +from krakenparser.pipeline import main as pipeline_main +from krakenparser.stats.diversity import main as diversity_main +from krakenparser.stats.relabund import main as relabund_main + +_MPA_A = "#Classification\tsample1\nd__Bacteria|s__Pseudomonas_aeruginosa\t300\n" +_MPA_B = "#Classification\tsample2\nd__Bacteria|s__Pseudomonas_aeruginosa\t100\n" + +_COMBINED_MPA = ( + "#Classification\tsample1\tsample2\n" + "d__Bacteria|p__Pseudomonadota|g__Pseudomonas|s__Pseudomonas_aeruginosa\t300\t100\n" + "d__Bacteria|p__Bacteroidota\t100\t80\n" +) + + +# --------------------------------------------------------------------------- +# convert2csv +# --------------------------------------------------------------------------- + + +def test_convert2csv_main(counts_txt_file, tmp_path, monkeypatch): + out = tmp_path / "out.csv" + monkeypatch.setattr( + sys, "argv", ["c2c", "-i", str(counts_txt_file), "-o", str(out)] + ) + convert2csv_main() + assert out.exists() + + +# --------------------------------------------------------------------------- +# processing_script +# --------------------------------------------------------------------------- + + +def test_processing_main(tmp_path, monkeypatch): + source = tmp_path / "COMBINED.txt" + source.write_text("#Classification\tsample1.kreport\n") + dest = tmp_path / "counts.txt" + dest.write_text("s__Pseudomonas_aeruginosa\t100\n") + monkeypatch.setattr(sys, "argv", ["ps", "-i", str(source), "-o", str(dest)]) + processing_main() + + +# --------------------------------------------------------------------------- +# split_mpa +# --------------------------------------------------------------------------- + + +def test_split_mpa_main(tmp_path, monkeypatch): + combined = tmp_path / "COMBINED.txt" + combined.write_text(_COMBINED_MPA) + out = tmp_path / "out" + monkeypatch.setattr(sys, "argv", ["sm", "-i", str(combined), "-o", str(out)]) + split_mpa_main() + assert (out / "txt" / "counts_species.txt").exists() + + +def test_split_mpa_main_viruses_only(tmp_path, monkeypatch): + combined = tmp_path / "COMBINED.txt" + combined.write_text(_COMBINED_MPA + "d__Viruses|s__Virus_X\t5\t3\n") + out = tmp_path / "out" + monkeypatch.setattr( + sys, "argv", ["sm", "-i", str(combined), "-o", str(out), "--viruses-only"] + ) + split_mpa_main() + + +def test_split_mpa_main_keep_human(tmp_path, monkeypatch): + combined = tmp_path / "COMBINED.txt" + combined.write_text(_COMBINED_MPA) + out = tmp_path / "out" + monkeypatch.setattr( + sys, "argv", ["sm", "-i", str(combined), "-o", str(out), "--keep-human"] + ) + split_mpa_main() + + +# --------------------------------------------------------------------------- +# mpa_table +# --------------------------------------------------------------------------- + + +def test_mpa_table_main(tmp_path, monkeypatch): + a, b = tmp_path / "a.MPA.TXT", tmp_path / "b.MPA.TXT" + a.write_text(_MPA_A) + b.write_text(_MPA_B) + out = tmp_path / "COMBINED.txt" + monkeypatch.setattr(sys, "argv", ["mt", "-i", str(a), str(b), "-o", str(out)]) + mpa_table_main() + assert out.exists() + + +# --------------------------------------------------------------------------- +# transform2mpa +# --------------------------------------------------------------------------- + + +def test_transform2mpa_main_single(kreport_file, tmp_path, monkeypatch): + out = tmp_path / "out.MPA.TXT" + monkeypatch.setattr(sys, "argv", ["t2m", "-r", str(kreport_file), "-o", str(out)]) + transform2mpa_main() + assert out.exists() + + +def test_transform2mpa_main_batch(kreport_file, tmp_path, monkeypatch): + kreports_dir = tmp_path / "kreports" + kreports_dir.mkdir() + shutil.copy(kreport_file, kreports_dir / kreport_file.name) + out_dir = tmp_path / "mpa_out" + monkeypatch.setattr( + sys, "argv", ["t2m", "-i", str(kreports_dir), "-o", str(out_dir)] + ) + transform2mpa_main() + assert out_dir.is_dir() + + +# --------------------------------------------------------------------------- +# diversity +# --------------------------------------------------------------------------- + + +def test_diversity_main_with_seed(counts_csv_file, tmp_path, monkeypatch): + out_dir = tmp_path / "div" + monkeypatch.setattr( + sys, + "argv", + [ + "div", + "-i", + str(counts_csv_file), + "-o", + str(out_dir), + "-d", + "1000", + "-s", + "42", + ], + ) + diversity_main() + assert (out_dir / "alpha_div.csv").exists() + + +def test_diversity_main_no_seed(counts_csv_file, tmp_path, monkeypatch): + out_dir = tmp_path / "div" + monkeypatch.setattr( + sys, + "argv", + ["div", "-i", str(counts_csv_file), "-o", str(out_dir), "-d", "1000"], + ) + diversity_main() + + +# --------------------------------------------------------------------------- +# relabund +# --------------------------------------------------------------------------- + + +def test_relabund_main(counts_csv_file, tmp_path, monkeypatch): + out = tmp_path / "ra.csv" + monkeypatch.setattr(sys, "argv", ["ra", "-i", str(counts_csv_file), "-o", str(out)]) + relabund_main() + assert out.exists() + + +def test_relabund_main_with_other_threshold(counts_csv_file, tmp_path, monkeypatch): + out = tmp_path / "ra.csv" + monkeypatch.setattr( + sys, "argv", ["ra", "-i", str(counts_csv_file), "-o", str(out), "-O", "50"] + ) + relabund_main() + + +def test_relabund_warns_zero_abundance_sample(tmp_path): + df = pd.DataFrame( + {"Sample_id": ["S1", "S2"], "Taxon_A": [0, 100], "Taxon_B": [0, 200]} + ) + csv_in = tmp_path / "counts.csv" + df.to_csv(csv_in, index=False) + out = tmp_path / "ra.csv" + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + from krakenparser.stats.relabund import calculate_rel_abund + + calculate_rel_abund(str(csv_in), str(out)) + assert any("zero total abundance" in str(w.message) for w in caught) + + +# --------------------------------------------------------------------------- +# pipeline (error paths only — success path covered by test_full_pipeline.py) +# --------------------------------------------------------------------------- + + +def test_pipeline_main_missing_input_exits(tmp_path, monkeypatch): + monkeypatch.setattr(sys, "argv", ["pipeline", "-i", str(tmp_path / "ghost")]) + with pytest.raises(SystemExit): + pipeline_main() diff --git a/tests/test_full_pipeline.py b/tests/test_full_pipeline.py index efa6854..72df80f 100644 --- a/tests/test_full_pipeline.py +++ b/tests/test_full_pipeline.py @@ -61,8 +61,8 @@ def test_pipeline_overwrite_protection(demo_run): run_pipeline(str(kreports_path)) - # Second run without --overwrite must exit - with pytest.raises(SystemExit): + # Second run without --overwrite must raise (library function, not sys.exit) + with pytest.raises(FileExistsError): run_pipeline(str(kreports_path)) diff --git a/tests/test_integration.py b/tests/test_integration.py index 35913d3..3fe61e6 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -9,7 +9,9 @@ from krakenparser.counts.convert2csv import convert_to_csv from krakenparser.counts.processing_script import process_files from krakenparser.counts.split_mpa import split_mpa +from krakenparser.mpa.mpa_table import combine_mpa from krakenparser.mpa.transform2mpa import kreport_to_mpa +from krakenparser.pipeline import _is_processable from krakenparser.stats.diversity import calc_alpha_div, calc_beta_div from krakenparser.stats.relabund import calculate_rel_abund @@ -361,3 +363,130 @@ def test_split_mpa_genus_excludes_species_lines(combined_mpa_file, tmp_path): def test_split_mpa_missing_input_raises(tmp_path): with pytest.raises(FileNotFoundError): split_mpa(str(tmp_path / "ghost.txt"), str(tmp_path / "out")) + + +# --------------------------------------------------------------------------- +# auto-create output directories (ensure_output_dir behaviour) +# --------------------------------------------------------------------------- + + +def test_kreport_to_mpa_creates_output_dir(kreport_file, tmp_path): + out = tmp_path / "new_subdir" / "out.MPA.TXT" + kreport_to_mpa(str(kreport_file), str(out)) + assert out.exists() + + +def test_kreport_to_mpa_missing_input_raises(tmp_path): + with pytest.raises(FileNotFoundError): + kreport_to_mpa(str(tmp_path / "ghost.kreport"), str(tmp_path / "out.MPA.TXT")) + + +def test_convert_to_csv_creates_output_dir(counts_txt_file, tmp_path): + out = tmp_path / "new_subdir" / "counts.csv" + convert_to_csv(str(counts_txt_file), str(out)) + assert out.exists() + + +def test_relabund_creates_output_dir(counts_csv_file, tmp_path): + out = tmp_path / "new_subdir" / "ra.csv" + calculate_rel_abund(str(counts_csv_file), str(out)) + assert out.exists() + + +def test_alpha_div_creates_output_dir(counts_csv_file, tmp_path): + df = pd.read_csv(counts_csv_file, index_col=0) + out_dir = tmp_path / "new_dir" / "nested" + calc_alpha_div(df, out_dir) + assert (out_dir / "alpha_div.csv").exists() + + +def test_beta_div_creates_output_dir(counts_csv_file, tmp_path): + df = pd.read_csv(counts_csv_file, index_col=0) + out_dir = tmp_path / "new_dir" / "nested" + calc_beta_div(df, out_dir, rarefaction_depth=1000, seed=42) + assert (out_dir / "beta_div_bray.csv").exists() + + +# --------------------------------------------------------------------------- +# combine_mpa — new input validation +# --------------------------------------------------------------------------- + +SAMPLE_MPA_A = "#Classification\tsample1\nd__Bacteria|s__Pseudomonas_aeruginosa\t300\n" +SAMPLE_MPA_B = "#Classification\tsample2\nd__Bacteria|s__Pseudomonas_aeruginosa\t100\n" + + +def test_combine_mpa_creates_output_dir(tmp_path): + a = tmp_path / "a.MPA.TXT" + b = tmp_path / "b.MPA.TXT" + a.write_text(SAMPLE_MPA_A) + b.write_text(SAMPLE_MPA_B) + out = tmp_path / "new_subdir" / "COMBINED.txt" + combine_mpa([str(a), str(b)], str(out)) + assert out.exists() + + +def test_combine_mpa_missing_input_raises(tmp_path): + existing = tmp_path / "a.MPA.TXT" + existing.write_text(SAMPLE_MPA_A) + with pytest.raises(FileNotFoundError): + combine_mpa( + [str(existing), str(tmp_path / "ghost.MPA.TXT")], str(tmp_path / "out.txt") + ) + + +# --------------------------------------------------------------------------- +# process_files — destination must already exist (in-place modifier) +# --------------------------------------------------------------------------- + + +def test_process_files_missing_dest_still_raises(tmp_path): + source = tmp_path / "COMBINED.txt" + source.write_text("#Classification\tsample1.kreport\n") + with pytest.raises(FileNotFoundError): + process_files(str(source), str(tmp_path / "nonexistent.txt")) + + +# --------------------------------------------------------------------------- +# split_mpa — t__ rank filter (intermediate terminal nodes) +# --------------------------------------------------------------------------- + + +def test_split_mpa_filters_terminal_rank_nodes(tmp_path): + combined = tmp_path / "COMBINED.txt" + combined.write_text( + "#Classification\tsample1\n" + "d__Bacteria|p__Pseudomonadota|s__Pseudomonas_aeruginosa\t300\n" + "d__Bacteria|p__Pseudomonadota|s__Pseudomonas_aeruginosa|t__strain_X\t10\n" + ) + split_mpa(str(combined), str(tmp_path / "out")) + species = (tmp_path / "out" / "txt" / "counts_species.txt").read_text() + assert "t__" not in species + + +# --------------------------------------------------------------------------- +# _is_processable — hidden files, null bytes, non-UTF-8 +# --------------------------------------------------------------------------- + + +def test_is_processable_hidden_file(tmp_path): + f = tmp_path / ".hidden" + f.write_text("content") + assert not _is_processable(f) + + +def test_is_processable_null_bytes(tmp_path): + f = tmp_path / "binary.bin" + f.write_bytes(b"hello\x00world") + assert not _is_processable(f) + + +def test_is_processable_non_utf8(tmp_path): + f = tmp_path / "latin1.txt" + f.write_bytes(b"\xff\xfe bad encoding") + assert not _is_processable(f) + + +def test_is_processable_valid_kreport(tmp_path): + f = tmp_path / "sample.kreport" + f.write_text("50.0\t500\t100\tS\t1\tBacteria\n") + assert _is_processable(f) diff --git a/tests/test_units.py b/tests/test_units.py index 39435d3..414d87d 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -1,12 +1,15 @@ """Pure-function unit tests — no I/O, fully deterministic.""" import math +from pathlib import Path import pytest from krakenparser.counts.processing_script import modify_taxa_names +from krakenparser.counts.split_mpa import _strip_path_prefix from krakenparser.mpa.transform2mpa import _parse_line from krakenparser.stats.diversity import chao1_index, pielou_evenness, shannon_index +from krakenparser.utils import ensure_output_dir # --------------------------------------------------------------------------- # _parse_line @@ -147,3 +150,52 @@ def test_modify_taxa_names_count_fields_not_modified(): # Underscores in tab-separated count fields must be preserved result = modify_taxa_names("s__My_taxon\t1_000\t2_000") assert result == "My taxon\t1_000\t2_000" + + +# --------------------------------------------------------------------------- +# _strip_path_prefix +# --------------------------------------------------------------------------- + + +def test_strip_path_prefix_tab_less_line(): + assert _strip_path_prefix("no_tab_here") == "no_tab_here" + + +def test_strip_path_prefix_normal(): + assert ( + _strip_path_prefix("d__Bacteria|s__E_coli\t100\t200") == "s__E_coli\t100\t200" + ) + + +# --------------------------------------------------------------------------- +# ensure_output_dir +# --------------------------------------------------------------------------- + + +def test_ensure_output_dir_file_creates_parent(tmp_path): + p = ensure_output_dir(tmp_path / "subdir" / "output.csv", is_file=True) + assert (tmp_path / "subdir").is_dir() + assert not p.exists() # only the parent is created, not the file itself + + +def test_ensure_output_dir_dir_creates_directory(tmp_path): + p = ensure_output_dir(tmp_path / "output_dir", is_file=False) + assert p.is_dir() + + +def test_ensure_output_dir_nested_creates_all_parents(tmp_path): + p = ensure_output_dir(tmp_path / "a" / "b" / "c", is_file=False) + assert p.is_dir() + + +def test_ensure_output_dir_returns_path_object(tmp_path): + p = ensure_output_dir(str(tmp_path / "out.csv"), is_file=True) + assert isinstance(p, Path) + + +def test_ensure_output_dir_idempotent_for_existing_dir(tmp_path): + existing = tmp_path / "already_exists" + existing.mkdir() + p = ensure_output_dir(existing, is_file=False) + assert p == existing + assert p.is_dir()