From 234c7d02596e14ccb616a627dd46d2c6ad4527f2 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 1 Jun 2026 20:18:22 +0200 Subject: [PATCH 01/13] feat(mpa): migrate CLI from argparse to typer --- krakenparser/mpa/mpa_table.py | 68 +++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 23 deletions(-) diff --git a/krakenparser/mpa/mpa_table.py b/krakenparser/mpa/mpa_table.py index a7cb717..f4a13dc 100644 --- a/krakenparser/mpa/mpa_table.py +++ b/krakenparser/mpa/mpa_table.py @@ -1,17 +1,26 @@ #!/usr/bin/env python """Combine multiple MPA-format files into a single merged table.""" -import argparse import logging +import sys from pathlib import Path +from typing import Optional + +import typer from krakenparser.utils import ensure_output_dir _log = logging.getLogger(__name__) +app = typer.Typer( + name="combine", + add_completion=False, + context_settings={"help_option_names": ["-h", "--help"]}, +) + -def combine_mpa(in_files: list[str], o_file: str) -> None: - out_path = ensure_output_dir(o_file, is_file=True) +def combine_mpa(in_files: list[Path], o_file: Path) -> None: + out_path = ensure_output_dir(str(o_file), is_file=True) samples: dict[int, str] = {} values: dict[str, dict[int, str]] = {} @@ -22,7 +31,7 @@ def combine_mpa(in_files: list[str], o_file: str) -> None: _log.info("Number of files to parse: %d", len(in_files)) for in_path in in_files: - if not Path(in_path).is_file(): + if not in_path.is_file(): raise FileNotFoundError(f"Input file not found: {in_path}") for in_path in in_files: @@ -91,29 +100,42 @@ def combine_mpa(in_files: list[str], o_file: str) -> None: _log.info("%d classifications written", count_written) -def main() -> None: - logging.basicConfig(level=logging.INFO, format="%(message)s") - parser = argparse.ArgumentParser( - description="Combine MPA files into a single tab-delimited table." 
- ) - parser.add_argument( +@app.callback(invoke_without_command=True) +def main( + ctx: typer.Context, # Контекст для работы с хелпом + in_files: Optional[list[Path]] = typer.Option( + None, "-i", "--input", - required=True, - nargs="+", - dest="in_files", - help="Input MPA files (one per sample)", - ) - parser.add_argument( + help="Input MPA files (one per sample). Повторите флаг -i для каждого файла.", + ), + o_file: Optional[Path] = typer.Option( + None, "-o", "--output", - required=True, - dest="o_file", - help="Output merged MPA file", - ) - args = parser.parse_args() - combine_mpa(args.in_files, args.o_file) + help="Output merged MPA file.", + ), +) -> None: + """Combine MPA files into a single tab-delimited table.""" + logging.basicConfig(level=logging.INFO, format="%(message)s") + + if not in_files and o_file is None: + print(ctx.get_help()) + raise typer.Exit() + + if not in_files or o_file is None: + print( + "Error: Missing required options '-i / --input' and '-o / --output'.", + file=sys.stderr, + ) + raise typer.Exit(code=1) + + try: + combine_mpa(in_files, o_file) + except FileNotFoundError as e: + print(f"Error: {e}", file=sys.stderr) + raise typer.Exit(code=1) if __name__ == "__main__": - main() + app() From abbda6722ad2d90153ff95e8ca98831c0c1cf774 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 1 Jun 2026 20:19:44 +0200 Subject: [PATCH 02/13] feat(counts): migrate convert2csv CLI from argparse to typer --- krakenparser/counts/convert2csv.py | 57 +++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/krakenparser/counts/convert2csv.py b/krakenparser/counts/convert2csv.py index 9fb763e..4b484b1 100755 --- a/krakenparser/counts/convert2csv.py +++ b/krakenparser/counts/convert2csv.py @@ -1,17 +1,24 @@ #!/usr/bin/env python - -import argparse import logging +import sys from pathlib import Path +from typing import Optional import pandas as pd +import typer from krakenparser.utils import ensure_output_dir _log 
= logging.getLogger(__name__) +app = typer.Typer( + name="csv", + add_completion=False, + context_settings={"help_option_names": ["-h", "--help"]}, +) + -def convert_to_csv(input_file, output_file): +def convert_to_csv(input_file: str, output_file: str) -> None: in_path = Path(input_file) if not in_path.is_file(): raise FileNotFoundError(f"Input file not found: {in_path}") @@ -22,26 +29,42 @@ def convert_to_csv(input_file, output_file): _log.info("Data converted and saved as '%s'.", output_file) -def main() -> None: - logging.basicConfig(level=logging.INFO, format="%(message)s") - parser = argparse.ArgumentParser( - description="Reads a TXT file, reorganizes the data, and converts it into a CSV file." - ) - parser.add_argument( +@app.callback(invoke_without_command=True) +def main( + ctx: typer.Context, # Контекст для нативного хелпа + input_file: Optional[str] = typer.Option( + None, "-i", "--input", - required=True, help="Path to the input TXT file. This file should contain sample names in columns and microbial taxa in rows.", - ) - parser.add_argument( + ), + output_file: Optional[str] = typer.Option( + None, "-o", "--output", - required=True, help="Path to the output CSV file. 
The script will restructure the data and save it here.", - ) - args = parser.parse_args() - convert_to_csv(args.input, args.output) + ), +) -> None: + """Reads a TXT file, reorganizes the data, and converts it into a CSV file.""" + logging.basicConfig(level=logging.INFO, format="%(message)s") + + if input_file is None and output_file is None: + print(ctx.get_help()) + raise typer.Exit() + + if not input_file or not output_file: + print( + "Error: Missing required options '-i / --input' and '-o / --output'.", + file=sys.stderr, + ) + raise typer.Exit(code=1) + + try: + convert_to_csv(input_file, output_file) + except FileNotFoundError as e: + print(f"Error: {e}", file=sys.stderr) + raise typer.Exit(code=1) if __name__ == "__main__": - main() + app() From fb2df80935f1f20f3d6a0faa97ff1b96cf39a489 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 1 Jun 2026 20:23:53 +0200 Subject: [PATCH 03/13] feat(counts): migrate processing_script CLI from argparse to typer --- krakenparser/counts/processing_script.py | 59 +++++++++++++++++------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/krakenparser/counts/processing_script.py b/krakenparser/counts/processing_script.py index 0d6d59a..065d25e 100755 --- a/krakenparser/counts/processing_script.py +++ b/krakenparser/counts/processing_script.py @@ -1,15 +1,24 @@ #!/usr/bin/env python -import argparse import logging import os +import sys import tempfile from pathlib import Path +from typing import Optional + +import typer _log = logging.getLogger(__name__) +app = typer.Typer( + name="process", + add_completion=False, + context_settings={"help_option_names": ["-h", "--help"]}, +) + -def modify_taxa_names(line): +def modify_taxa_names(line: str) -> str: prefixes = ["s__", "g__", "f__", "o__", "c__", "p__"] for prefix in prefixes: if line.startswith(prefix): @@ -19,7 +28,7 @@ def modify_taxa_names(line): return line -def process_files(source_file, destination_file): +def process_files(source_file: str, 
destination_file: str) -> None: src_path = Path(source_file) if not src_path.is_file(): raise FileNotFoundError(f"Source file not found: {src_path}") @@ -53,26 +62,42 @@ def process_files(source_file, destination_file): _log.info(f"Processed {destination_file} successfully.") -def main() -> None: - logging.basicConfig(level=logging.INFO, format="%(message)s") - parser = argparse.ArgumentParser( - description="Reads a source file, processes its first line, modifies taxa names in a destination file, and updates it." - ) - parser.add_argument( +@app.callback(invoke_without_command=True) +def main( + ctx: typer.Context, + input_file: Optional[str] = typer.Option( + None, "-i", "--input", - required=True, help="Path to the source file. This file's first line will be read and modified.", - ) - parser.add_argument( + ), + output_file: Optional[str] = typer.Option( + None, "-o", "--output", - required=True, help="Path to the destination file. This file's contents will be updated with cleaned taxa names.", - ) - args = parser.parse_args() - process_files(args.input, args.output) + ), +) -> None: + """Reads a source file, processes its first line, modifies taxa names in a destination file, and updates it.""" + logging.basicConfig(level=logging.INFO, format="%(message)s") + + if input_file is None and output_file is None: + print(ctx.get_help()) + raise typer.Exit() + + if not input_file or not output_file: + print( + "Error: Missing required options '-i / --input' and '-o / --output'.", + file=sys.stderr, + ) + raise typer.Exit(code=1) + + try: + process_files(input_file, output_file) + except FileNotFoundError as e: + print(f"Error: {e}", file=sys.stderr) + raise typer.Exit(code=1) if __name__ == "__main__": - main() + app() From 54881f41ff8ce556137bf518b27a4d6a75d3521e Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 1 Jun 2026 20:25:19 +0200 Subject: [PATCH 04/13] feat(counts): migrate split_mpa to typer and add domain filtering options --- 
krakenparser/counts/split_mpa.py | 154 +++++++++++++++++++++++-------- 1 file changed, 113 insertions(+), 41 deletions(-) diff --git a/krakenparser/counts/split_mpa.py b/krakenparser/counts/split_mpa.py index 1af110e..ac90ef1 100644 --- a/krakenparser/counts/split_mpa.py +++ b/krakenparser/counts/split_mpa.py @@ -4,15 +4,23 @@ Replaces decombine.sh and decombine_viruses.sh. """ -import argparse import logging import re +import sys from pathlib import Path +from typing import Optional + +import typer from krakenparser.utils import ensure_output_dir _log = logging.getLogger(__name__) +app = typer.Typer( + name="split", + add_completion=False, + context_settings={"help_option_names": ["-h", "--help"]}, +) _RANKS = [ ("species", "s__", []), @@ -23,20 +31,21 @@ ("phylum", "p__", ["s__", "g__", "f__", "o__", "c__"]), ] -_HUMAN_TAXA = { - "species": "s__Homo_sapiens", - "genus": "g__Homo", - "family": "f__Hominidae", - "order": "o__Primates", - "class": "c__Mammalia", - "phylum": "p__Chordata", -} +_HUMAN_MARKERS = frozenset( + [ + "s__Homo_sapiens", + "g__Homo", + "f__Hominidae", + "o__Primates", + "c__Mammalia", + "p__Chordata", + ] +) _ACCESSION_RE = re.compile(r"(SRS|SRR|SRX|ERS|ERR|ERX|DRS|DRR|DRX)\d*-") def _strip_path_prefix(line: str) -> str: - """'d__X|p__Y|s__Z\t10\t20' → 's__Z\t10\t20'""" tab = line.find("\t") if tab == -1: return line @@ -46,10 +55,20 @@ def _strip_path_prefix(line: str) -> str: return _ACCESSION_RE.sub("", segment + rest) +def _human_in_line(line: str) -> bool: + tab = line.find("\t") + path = line[:tab] if tab != -1 else line + segments = set(path.split("|")) + return bool(segments & _HUMAN_MARKERS) + + def split_mpa( input_file: str, output_dir: str, viruses_only: bool = False, + bacteria_only: bool = False, + fungi_only: bool = False, + archaea_only: bool = False, keep_human: bool = False, ) -> None: in_path = Path(input_file) @@ -58,17 +77,28 @@ def split_mpa( out_path = ensure_output_dir(output_dir, is_file=False) (out_path / 
"txt").mkdir(exist_ok=True) - lines = in_path.read_text().splitlines() - data_lines = [ln for ln in lines if not ln.startswith("#") and ln.strip()] + all_lines = [ + ln + for ln in in_path.read_text().splitlines() + if not ln.startswith("#") and ln.strip() + ] + data_lines = all_lines.copy() if viruses_only: data_lines = [ln for ln in data_lines if "d__Viruses" in ln] + if bacteria_only: + data_lines = [ln for ln in data_lines if "d__Bacteria" in ln] + if fungi_only: + data_lines = [ln for ln in data_lines if "k__Fungi" in ln] + if archaea_only: + data_lines = [ln for ln in data_lines if "d__Archaea" in ln] - filter_human = not keep_human and not viruses_only + if keep_human: + human_lines = [ln for ln in all_lines if _human_in_line(ln)] + data_lines = list(dict.fromkeys(data_lines + human_lines)) for rank_name, rank_prefix, exclude_prefixes in _RANKS: result = [] - human_pattern = _HUMAN_TAXA[rank_name] for line in data_lines: if rank_prefix not in line: @@ -77,7 +107,7 @@ def split_mpa( continue if any(ep in line for ep in exclude_prefixes): continue - if filter_human and human_pattern in line: + if not keep_human and _human_in_line(line): continue result.append(_strip_path_prefix(line)) @@ -87,33 +117,75 @@ def split_mpa( _log.info("MPA file split successfully. Output stored in %s", output_dir) -def main() -> None: - logging.basicConfig(level=logging.INFO, format="%(message)s") - parser = argparse.ArgumentParser( - description="Split a combined MPA table into per-rank TXT files." 
- ) - parser.add_argument("-i", "--input", required=True, help="Input combined MPA file") - parser.add_argument("-o", "--output", required=True, help="Output directory") - parser.add_argument( - "--viruses-only", - action="store_true", - default=False, - help="Extract only Viruses domain taxa", - ) - parser.add_argument( +@app.callback(invoke_without_command=True) +def main( + ctx: typer.Context, + input_file: Optional[str] = typer.Option( + None, + "-i", + "--input", + help="Input combined MPA file.", + ), + output_dir: Optional[str] = typer.Option( + None, + "-o", + "--output", + help="Output directory.", + ), + viruses_only: bool = typer.Option( + False, + "--viruses", + help="Extract only VIRUSES domain taxa.", + ), + bacteria_only: bool = typer.Option( + False, + "--bacteria", + help="Extract only BACTERIA domain taxa.", + ), + fungi_only: bool = typer.Option( + False, + "--fungi", + help="Extract only FUNGI kingdom taxa.", + ), + archaea_only: bool = typer.Option( + False, + "--archaea", + help="Extract only ARCHAEA domain taxa.", + ), + keep_human: bool = typer.Option( + False, "--keep-human", - action="store_true", - default=False, - help="Do not filter human-related taxa (default: filtered)", - ) - args = parser.parse_args() - split_mpa( - args.input, - args.output, - viruses_only=args.viruses_only, - keep_human=args.keep_human, - ) + help="Retain human-related taxa (default: filtered out).", + ), +) -> None: + """Split a combined MPA table into per-rank TXT files.""" + logging.basicConfig(level=logging.INFO, format="%(message)s") + + if input_file is None and output_dir is None: + print(ctx.get_help()) + raise typer.Exit() + + if not input_file or not output_dir: + print( + "Error: Missing required options '-i / --input' and '-o / --output'.", + file=sys.stderr, + ) + raise typer.Exit(code=1) + + try: + split_mpa( + input_file, + output_dir, + viruses_only=viruses_only, + bacteria_only=bacteria_only, + fungi_only=fungi_only, + archaea_only=archaea_only, + 
keep_human=keep_human, + ) + except FileNotFoundError as e: + print(f"Error: {e}", file=sys.stderr) + raise typer.Exit(code=1) if __name__ == "__main__": - main() + app() From fb2aeffeb7f3c1386093db4d6bc22b3006fc4e9c Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 1 Jun 2026 20:28:43 +0200 Subject: [PATCH 05/13] feat(mpa): migrate transform2mpa CLI to typer and update path typing --- krakenparser/mpa/transform2mpa.py | 197 +++++++++++++++--------------- 1 file changed, 98 insertions(+), 99 deletions(-) diff --git a/krakenparser/mpa/transform2mpa.py b/krakenparser/mpa/transform2mpa.py index cf0bdae..1ec5fd7 100644 --- a/krakenparser/mpa/transform2mpa.py +++ b/krakenparser/mpa/transform2mpa.py @@ -1,22 +1,29 @@ #!/usr/bin/env python """Convert a Kraken2 report to MetaPhlAn (MPA) format.""" -import argparse import logging import os import sys from pathlib import Path +from typing import Optional + +import typer from krakenparser.utils import ensure_output_dir _log = logging.getLogger(__name__) +app = typer.Typer( + name="mpa", + add_completion=False, + context_settings={"help_option_names": ["-h", "--help"]}, +) + _MAIN_LVLS = {"R", "K", "D", "P", "C", "O", "F", "G", "S"} def _parse_line(line: str, remove_spaces: bool = False) -> list: - """ - Parse one Kraken2 report line. + """Parse one Kraken2 report line. Returns [name, level_num, level_type, all_reads, percents] or empty list on malformed input. @@ -71,23 +78,17 @@ def _parse_line(line: str, remove_spaces: bool = False) -> list: def kreport_to_mpa( - report_path: str, - output_path: str, + report_path: Path, + output_path: Path, display_header: bool = False, include_intermediate: bool = False, use_reads: bool = True, remove_spaces: bool = True, ) -> None: - """ - Convert a single Kraken2 report to MPA format. - - Tracks the current taxonomic path via curr_path and prev_lvl_num, - popping the stack when moving back up the tree — exactly as the - original script does. 
- """ - if not Path(report_path).is_file(): + """Convert a single Kraken2 report to MPA format.""" + if not report_path.is_file(): raise FileNotFoundError(f"Input file not found: {report_path}") - out_path = ensure_output_dir(output_path, is_file=True) + out_path = ensure_output_dir(str(output_path), is_file=True) curr_path: list[str] = [] prev_lvl_num = -1 @@ -103,11 +104,9 @@ def kreport_to_mpa( name, level_num, level_type, all_reads, percents = report_vals - # Пропускаем unclassified if level_type == "U": continue - # Нормализуем тип уровня if level_type not in _MAIN_LVLS: level_type = "x" elif level_type == "K": @@ -140,104 +139,104 @@ def kreport_to_mpa( prev_lvl_num = level_num -def main() -> None: - logging.basicConfig(level=logging.INFO, format="%(message)s") - parser = argparse.ArgumentParser( - description="Convert a Kraken2 report to MetaPhlAn (MPA) format." - ) - - mode = parser.add_mutually_exclusive_group(required=True) - mode.add_argument( +@app.callback(invoke_without_command=True) +def main( + ctx: typer.Context, + r_file: Optional[Path] = typer.Option( + None, "-r", "--report-file", "--report", - dest="r_file", - help="Single input Kraken2 report file", - ) - mode.add_argument( + help="Single input Kraken2 report file.", + ), + input_dir: Optional[Path] = typer.Option( + None, "-i", "--input", - dest="input_dir", - help="Input directory containing Kraken2 report files (batch mode)", - ) - parser.add_argument( + help="Input directory containing Kraken2 report files (batch mode).", + ), + o_file: Optional[Path] = typer.Option( + None, "-o", "--output", - required=True, - dest="o_file", - help="Output MPA file (single mode) or output directory (batch mode)", - ) - parser.add_argument( + help="Output MPA file (single mode) or output directory (batch mode).", + ), + display_header: bool = typer.Option( + False, "--display-header", - action="store_true", - dest="add_header", - default=False, - help="Write a header line with the sample name (filename)", 
- ) - parser.add_argument( - "--read_count", - action="store_true", - dest="use_reads", - default=True, - help="Output clade read counts [default]", - ) - parser.add_argument( + help="Write a header line with the sample name (filename).", + ), + percentages: bool = typer.Option( + False, "--percentages", - action="store_false", - dest="use_reads", - help="Output percentages instead of read counts", - ) - parser.add_argument( + help="Output percentages instead of read counts.", + ), + intermediate_ranks: bool = typer.Option( + False, "--intermediate-ranks", - action="store_true", - dest="x_include", - default=False, - help="Include non-standard taxonomic ranks in output", - ) - parser.add_argument( - "--no-intermediate-ranks", - action="store_false", - dest="x_include", - help="Exclude non-standard taxonomic ranks [default]", - ) - group = parser.add_mutually_exclusive_group() - group.add_argument( - "--remove-spaces", - action="store_true", - dest="remove_spaces", - default=True, - help="Replace spaces with underscores in taxon names [default]", - ) - group.add_argument( + help="Include non-standard taxonomic ranks in output.", + ), + keep_spaces: bool = typer.Option( + False, "--keep-spaces", - action="store_false", - dest="remove_spaces", - help="Keep spaces in taxon names", - ) - args = parser.parse_args() + help="Keep spaces in taxon names instead of replacing them with underscores.", + ), +) -> None: + """Convert a Kraken2 report to MetaPhlAn (MPA) format.""" + logging.basicConfig(level=logging.INFO, format="%(message)s") + + if r_file is None and input_dir is None and o_file is None: + print(ctx.get_help()) + raise typer.Exit() + + if o_file is None: + print("Error: Missing required option '-o / --output'.", file=sys.stderr) + raise typer.Exit(code=1) + + if r_file is None and input_dir is None: + print( + "Error: Either -r/--report-file or -i/--input must be provided.", + file=sys.stderr, + ) + raise typer.Exit(code=1) + + if r_file is not None and input_dir 
is not None: + print( + "Error: Cannot use both -r/--report-file and -i/--input simultaneously.", + file=sys.stderr, + ) + raise typer.Exit(code=1) + + use_reads = not percentages + remove_spaces = not keep_spaces kwargs = dict( - display_header=args.add_header, - include_intermediate=args.x_include, - use_reads=args.use_reads, - remove_spaces=args.remove_spaces, + display_header=display_header, + include_intermediate=intermediate_ranks, + use_reads=use_reads, + remove_spaces=remove_spaces, ) - if args.input_dir: - input_dir = Path(args.input_dir) - if not input_dir.is_dir(): - sys.exit(f"Error: input directory not found: {input_dir}") - output_dir = Path(args.o_file) - output_dir.mkdir(parents=True, exist_ok=True) - for f in sorted(input_dir.iterdir()): - if not f.is_file(): - continue - out_name = f.name.replace(".kreport", ".MPA.TXT") - kreport_to_mpa(str(f), str(output_dir / out_name), **kwargs) - _log.info("Converted to MPA successfully. Output stored in %s", output_dir) - else: - kreport_to_mpa(args.r_file, args.o_file, **kwargs) + try: + if input_dir: + if not input_dir.is_dir(): + print(f"Error: input directory not found: {input_dir}", file=sys.stderr) + raise typer.Exit(code=1) + + o_file.mkdir(parents=True, exist_ok=True) + for f in sorted(input_dir.iterdir()): + if not f.is_file(): + continue + out_name = f.name.replace(".kreport", ".MPA.TXT") + kreport_to_mpa(f, o_file / out_name, **kwargs) + _log.info("Converted to MPA successfully. 
Output stored in %s", o_file) + else: + kreport_to_mpa(r_file, o_file, **kwargs) + + except FileNotFoundError as e: + print(f"Error: {e}", file=sys.stderr) + raise typer.Exit(code=1) if __name__ == "__main__": - main() + app() From b597b9b5c64bb96636454f88d5a82eabefd578a5 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 1 Jun 2026 20:29:30 +0200 Subject: [PATCH 06/13] feat(stats): migrate diversity CLI from argparse to typer --- krakenparser/stats/diversity.py | 95 +++++++++++++++++++++------------ 1 file changed, 62 insertions(+), 33 deletions(-) diff --git a/krakenparser/stats/diversity.py b/krakenparser/stats/diversity.py index 4977f55..ef276d9 100644 --- a/krakenparser/stats/diversity.py +++ b/krakenparser/stats/diversity.py @@ -1,18 +1,25 @@ #!/usr/bin/env python -import argparse import logging import sys from pathlib import Path +from typing import Optional import numpy as np import pandas as pd +import typer from scipy.spatial.distance import pdist, squareform from krakenparser.utils import ensure_output_dir _log = logging.getLogger(__name__) +app = typer.Typer( + name="diversity", + add_completion=False, + context_settings={"help_option_names": ["-h", "--help"]}, +) + def shannon_index(counts): counts = np.array(counts) @@ -48,8 +55,8 @@ def _subsample_counts( return np.bincount(sampled, minlength=len(counts)).astype(int) -def calc_alpha_div(df, output_path): - out_path = ensure_output_dir(output_path, is_file=False) +def calc_alpha_div(df: pd.DataFrame, output_path: Path) -> None: + out_path = ensure_output_dir(str(output_path), is_file=False) results = [] for sample_id, row in df.iterrows(): counts = row.values @@ -69,8 +76,13 @@ def calc_alpha_div(df, output_path): ) -def calc_beta_div(df, output_path, rarefaction_depth, seed=None): - out_path = ensure_output_dir(output_path, is_file=False) +def calc_beta_div( + df: pd.DataFrame, + output_path: Path, + rarefaction_depth: int, + seed: Optional[int] = None, +) -> None: + out_path = 
ensure_output_dir(str(output_path), is_file=False) rng = np.random.default_rng(seed) rarefied_counts: list[np.ndarray] = [] sample_ids: list[str] = [] @@ -107,50 +119,67 @@ def calc_beta_div(df, output_path, rarefaction_depth, seed=None): ) -def main() -> None: - logging.basicConfig(level=logging.INFO, format="%(message)s") - parser = argparse.ArgumentParser(description="Calculate α & β-diversities.") - parser.add_argument( +@app.callback(invoke_without_command=True) +def main( + ctx: typer.Context, + input_file: Optional[Path] = typer.Option( + None, "-i", "--input", - required=True, help="Input total count table CSV (species level).", - ) - parser.add_argument("-o", "--output", required=True, help="Output directory path.") - parser.add_argument( + ), + output_dir: Optional[Path] = typer.Option( + None, + "-o", + "--output", + help="Output directory path.", + ), + depth: int = typer.Option( + 1000, "-d", "--depth", - type=int, - default=1000, - help="Rarefaction depth for β diversity (default: 1000).", - ) - parser.add_argument( + help="Rarefaction depth for β diversity.", + ), + seed: Optional[int] = typer.Option( + None, "-s", "--seed", - type=int, - default=None, help="Random seed for reproducible rarefaction (default: random).", - ) - args = parser.parse_args() + ), +) -> None: + """Calculate α & β-diversities for microbial communities.""" + logging.basicConfig(level=logging.INFO, format="%(message)s") + + if input_file is None and output_dir is None: + print(ctx.get_help()) + raise typer.Exit() + + if not input_file or not output_dir: + print( + "Error: Missing required options '-i / --input' and '-o / --output'.", + file=sys.stderr, + ) + raise typer.Exit(code=1) seed_label = ( - str(args.seed) - if args.seed is not None - else "not set (results will vary between runs)" + str(seed) if seed is not None else "not set (results will vary between runs)" ) - _log.info("Rarefaction depth: %d | seed: %s", args.depth, seed_label) + _log.info("Rarefaction depth: %d | 
seed: %s", depth, seed_label) - input_file = Path(args.input) if not input_file.is_file(): - sys.exit(f"Error: input file not found: {input_file}") - output_dir = Path(args.output) - output_dir.mkdir(parents=True, exist_ok=True) + print(f"Error: input file not found: {input_file}", file=sys.stderr) + raise typer.Exit(code=1) + output_dir.mkdir(parents=True, exist_ok=True) df = pd.read_csv(input_file, index_col=0) - calc_alpha_div(df, output_dir) - calc_beta_div(df, output_dir, args.depth, seed=args.seed) + try: + calc_alpha_div(df, output_dir) + calc_beta_div(df, output_dir, depth, seed=seed) + except ValueError as e: + print(f"Error: {e}", file=sys.stderr) + raise typer.Exit(code=1) if __name__ == "__main__": - main() + app() From 268dbdecda28ad9369966b8e0b5ead243de6a534 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 1 Jun 2026 20:30:23 +0200 Subject: [PATCH 07/13] feat(stats): migrate relabund CLI from argparse to typer --- krakenparser/stats/relabund.py | 72 ++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 20 deletions(-) diff --git a/krakenparser/stats/relabund.py b/krakenparser/stats/relabund.py index 0957329..ee924d9 100644 --- a/krakenparser/stats/relabund.py +++ b/krakenparser/stats/relabund.py @@ -1,22 +1,32 @@ #!/usr/bin/env python -import argparse import logging +import sys import warnings from pathlib import Path +from typing import Optional import pandas as pd +import typer from krakenparser.utils import ensure_output_dir _log = logging.getLogger(__name__) +app = typer.Typer( + name="relabund", + add_completion=False, + context_settings={"help_option_names": ["-h", "--help"]}, +) -def calculate_rel_abund(input_file, output_file, other_threshold=None): + +def calculate_rel_abund( + input_file: Path, output_file: Path, other_threshold: Optional[float] = None +) -> None: in_path = Path(input_file) if not in_path.is_file(): raise FileNotFoundError(f"Input file not found: {in_path}") - out_path = ensure_output_dir(output_file, 
is_file=True) + out_path = ensure_output_dir(str(output_file), is_file=True) # Load counts table df = pd.read_csv(in_path) @@ -63,26 +73,48 @@ def calculate_rel_abund(input_file, output_file, other_threshold=None): _log.info("Relative abundance saved as '%s'.", output_file) -def main() -> None: - logging.basicConfig(level=logging.INFO, format="%(message)s") - parser = argparse.ArgumentParser( - description="Calculates taxa relative abundance and saves it to a CSV file." - ) - parser.add_argument( - "-i", "--input", required=True, help="Input CSV file with counts." - ) - parser.add_argument("-o", "--output", required=True, help="Output CSV file path.") - parser.add_argument( +@app.callback(invoke_without_command=True) +def main( + ctx: typer.Context, # Контекст для вызова хелпа + input_file: Optional[Path] = typer.Option( + None, + "-i", + "--input", + help="Input CSV file with counts.", + ), + output_file: Optional[Path] = typer.Option( + None, + "-o", + "--output", + help="Output CSV file path.", + ), + other: Optional[float] = typer.Option( + None, "-O", "--other", - type=float, - default=None, - help="Threshold for grouping taxa into 'Other ( None: + """Calculates taxa relative abundance and saves it to a CSV file.""" + logging.basicConfig(level=logging.INFO, format="%(message)s") + + if input_file is None and output_file is None: + print(ctx.get_help()) + raise typer.Exit() + + if not input_file or not output_file: + print( + "Error: Missing required options '-i / --input' and '-o / --output'.", + file=sys.stderr, + ) + raise typer.Exit(code=1) - args = parser.parse_args() - calculate_rel_abund(args.input, args.output, args.other) + try: + calculate_rel_abund(input_file, output_file, other) + except FileNotFoundError as e: + print(f"Error: {e}", file=sys.stderr) + raise typer.Exit(code=1) if __name__ == "__main__": - main() + app() From 19fc6a3503caf24dfcd4544f5a16bf7c9bb40010 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 1 Jun 2026 20:31:32 +0200 
Subject: [PATCH 08/13] feat(pipeline): migrate main entrypoint to typer and expose domain filters --- krakenparser/pipeline.py | 154 +++++++++++++++++++++++---------------- 1 file changed, 90 insertions(+), 64 deletions(-) diff --git a/krakenparser/pipeline.py b/krakenparser/pipeline.py index 5bafa51..5a3a5aa 100644 --- a/krakenparser/pipeline.py +++ b/krakenparser/pipeline.py @@ -1,13 +1,14 @@ #!/usr/bin/env python """Full KrakenParser pipeline: kreport → MPA → combined → counts → rel_abund → diversity.""" -import argparse import logging import shutil import sys from pathlib import Path +from typing import Optional import pandas as pd +import typer from krakenparser.counts.convert2csv import convert_to_csv from krakenparser.counts.processing_script import process_files @@ -19,6 +20,12 @@ _log = logging.getLogger(__name__) +app = typer.Typer( + name="run", + add_completion=False, + context_settings={"help_option_names": ["-h", "--help"]}, +) + def _is_processable(path: Path) -> bool: """Return False for hidden files, files with null bytes, or non-UTF-8 files.""" @@ -38,19 +45,22 @@ def _is_processable(path: Path) -> bool: def run_pipeline( - input_dir: str, - output_dir: str | None = None, + input_dir: Path, + output_dir: Optional[Path] = None, keep_human: bool = False, viruses_only: bool = False, + bacteria_only: bool = False, + fungi_only: bool = False, + archaea_only: bool = False, rarefaction_depth: int = 1000, - seed: int | None = None, + seed: Optional[int] = None, overwrite: bool = False, ) -> None: - source_dir = Path(input_dir) + source_dir = input_dir if not source_dir.is_dir(): raise FileNotFoundError(f"Input directory not found: {source_dir}") - out_dir = Path(output_dir) if output_dir else source_dir.parent + out_dir = output_dir if output_dir else source_dir.parent out_dir.mkdir(parents=True, exist_ok=True) existing = [out_dir / d for d in _OUTPUT_SUBDIRS if (out_dir / d).exists()] @@ -68,7 +78,6 @@ def run_pipeline( intermediate_dir = out_dir / 
"intermediate" intermediate_dir.mkdir(exist_ok=True) - # Part 1: kreport → MPA mpa_dir = intermediate_dir / "mpa" mpa_dir.mkdir(exist_ok=True) for f in sorted(source_dir.iterdir()): @@ -78,47 +87,46 @@ def run_pipeline( _log.info("Skipping: %s", f.name) continue out_name = f.stem + ".MPA.TXT" - kreport_to_mpa(str(f), str(mpa_dir / out_name), display_header=True) + kreport_to_mpa(f, mpa_dir / out_name, display_header=True) - # Part 2: combine MPAs mpa_files = sorted(mpa_dir.glob("*.MPA.TXT")) if not mpa_files: - sys.exit("Error: no MPA files found after conversion.") + print("Error: no MPA files found after conversion.", file=sys.stderr) + raise typer.Exit(code=1) combined_file = intermediate_dir / "COMBINED.txt" - combine_mpa([str(f) for f in mpa_files], str(combined_file)) + combine_mpa(mpa_files, combined_file) _log.info("MPA files combined. Output: %s", combined_file) - # Part 3: split combined MPA by rank split_mpa( str(combined_file), str(intermediate_dir), keep_human=keep_human, viruses_only=viruses_only, + bacteria_only=bacteria_only, + fungi_only=fungi_only, + archaea_only=archaea_only, ) txt_dir = intermediate_dir / "txt" - # Part 4: clean taxa names and add sample header for txt_file in sorted(txt_dir.glob("counts_*.txt")): process_files(str(combined_file), str(txt_file)) - # Part 5: TXT → CSV counts_dir = out_dir / "counts" counts_dir.mkdir(exist_ok=True) for txt_file in sorted(txt_dir.glob("counts_*.txt")): csv_file = counts_dir / txt_file.with_suffix(".csv").name convert_to_csv(str(txt_file), str(csv_file)) - # Part 6: relative abundance rel_abund_dir = out_dir / "rel_abund" rel_abund_dir.mkdir(exist_ok=True) for csv_file in sorted(counts_dir.glob("counts_*.csv")): ra_file = rel_abund_dir / csv_file.name.replace("counts_", "ra_") - calculate_rel_abund(str(csv_file), str(ra_file)) + calculate_rel_abund(csv_file, ra_file) - # Part 7: α & β-diversities species_csv = counts_dir / "counts_species.csv" if not species_csv.exists(): - sys.exit(f"Error: species 
counts not found: {species_csv}") + print(f"Error: species counts not found: {species_csv}", file=sys.stderr) + raise typer.Exit(code=1) diversity_dir = out_dir / "diversity" diversity_dir.mkdir(exist_ok=True) df = pd.read_csv(species_csv, index_col=0) @@ -128,67 +136,85 @@ def run_pipeline( _log.info("All steps completed successfully!") -def main() -> None: - logging.basicConfig(level=logging.INFO, format="%(message)s") - parser = argparse.ArgumentParser(description="Run the full KrakenParser pipeline.") - parser.add_argument( +@app.command( + help="Run the full KrakenParser pipeline.", + no_args_is_help=True, +) +def main( + input_dir: Path = typer.Option( + ..., "-i", "--input", - required=True, - help="Directory containing Kraken2 report files", - ) - parser.add_argument( + help="Directory containing Kraken2 report files.", + ), + output_dir: Optional[Path] = typer.Option( + None, "-o", "--output", - default=None, - help="Output directory (default: parent of input)", - ) - parser.add_argument( + help="Output directory (default: parent of input).", + ), + keep_human: bool = typer.Option( + False, "--keep-human", - action="store_true", - default=False, - help="Do not filter human-related taxa (default: filtered)", - ) - parser.add_argument( + help="Do not filter human-related taxa (default: filtered).", + ), + viruses: bool = typer.Option( + False, "--viruses", - action="store_true", - default=False, - help="Extract only Viruses domain taxa in the pipeline", - ) - parser.add_argument( + help="Extract only Viruses domain taxa.", + ), + bacteria: bool = typer.Option( + False, + "--bacteria", + help="Extract only Bacteria domain taxa.", + ), + fungi: bool = typer.Option( + False, + "--fungi", + help="Extract only Fungi kingdom taxa.", + ), + archaea: bool = typer.Option( + False, + "--archaea", + help="Extract only Archaea domain taxa.", + ), + depth: int = typer.Option( + 1000, "-d", "--depth", - type=int, - default=1000, - help="Rarefaction depth for β-diversity 
(default: 1000)", - ) - parser.add_argument( + help="Rarefaction depth for β-diversity.", + ), + seed: Optional[int] = typer.Option( + None, "-s", "--seed", - type=int, - default=None, - help="Random seed for reproducible rarefaction (default: random)", - ) - parser.add_argument( + help="Random seed for reproducible rarefaction (default: random).", + ), + overwrite: bool = typer.Option( + False, "--overwrite", - action="store_true", - default=False, - help="Overwrite the output directory if it already exists", - ) - args = parser.parse_args() + help="Overwrite the output directory if it already exists.", + ), +) -> None: + logging.basicConfig(level=logging.INFO, format="%(message)s") + try: run_pipeline( - args.input, - args.output, - keep_human=args.keep_human, - viruses_only=args.viruses, - rarefaction_depth=args.depth, - seed=args.seed, - overwrite=args.overwrite, + input_dir=input_dir, + output_dir=output_dir, + keep_human=keep_human, + viruses_only=viruses, + bacteria_only=bacteria, + fungi_only=fungi, + archaea_only=archaea, + rarefaction_depth=depth, + seed=seed, + overwrite=overwrite, ) except (FileNotFoundError, FileExistsError) as e: - sys.exit(f"Error: {e}") + print(f"Error: {e}", file=sys.stderr) + raise typer.Exit(code=1) if __name__ == "__main__": - main() + app() From 18e9a9b840e309c0173ad029f499e9b34442ea16 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 1 Jun 2026 20:32:44 +0200 Subject: [PATCH 09/13] feat(cli): refactor core entrypoint to native Typer subcommands --- krakenparser/krakenparser.py | 342 ++++++++++++++++------------------- 1 file changed, 157 insertions(+), 185 deletions(-) diff --git a/krakenparser/krakenparser.py b/krakenparser/krakenparser.py index 523a3f5..7967da9 100755 --- a/krakenparser/krakenparser.py +++ b/krakenparser/krakenparser.py @@ -1,227 +1,199 @@ -import argparse +#!/usr/bin/env python +"""KrakenParser: Convert Kraken2 Reports to CSV and analyze microbial diversity. 
+ +Built with native Typer subcommands while preserving a direct root interface +for the full pipeline execution without the 'run' keyword. +""" + import logging -import subprocess import sys from importlib.metadata import PackageNotFoundError as _PNF from importlib.metadata import version as _pkg_version from pathlib import Path +from typing import Optional + +import typer + +from krakenparser.counts.convert2csv import app as csv_app +from krakenparser.counts.processing_script import app as process_app +from krakenparser.counts.split_mpa import app as split_app +from krakenparser.mpa.mpa_table import app as combine_app +from krakenparser.mpa.transform2mpa import app as mpa_app +from krakenparser.pipeline import run_pipeline +from krakenparser.stats.diversity import app as diversity_app +from krakenparser.stats.relabund import app as relabund_app try: __version__ = _pkg_version("krakenparser") except _PNF: __version__ = "unknown" - -def main(): - print("KrakenParser by Ilia V. Popov") - logging.basicConfig(level=logging.INFO, format="%(message)s") - package_dir = Path(__file__).resolve().parent - - # Map of advanced steps for granular pipeline execution control - step_map = { - "mpa": (package_dir / "mpa" / "transform2mpa.py", []), - "combine": (package_dir / "mpa" / "mpa_table.py", []), - "split": (package_dir / "counts" / "split_mpa.py", []), - "process": (package_dir / "counts" / "processing_script.py", []), - "csv": (package_dir / "counts" / "convert2csv.py", []), - "relabund": (package_dir / "stats" / "relabund.py", []), - "diversity": (package_dir / "stats" / "diversity.py", []), - } - - def _build_cmd( - script: Path, base_args: list[str], user_args: list[str] - ) -> list[str]: - if script.suffix == ".py": - # Execute as module to preserve relative imports within the package - module = ".".join( - script.relative_to(package_dir.parent).with_suffix("").parts - ) - return [sys.executable, "-m", module] + base_args + user_args - return [str(script)] + base_args 
+ user_args - - # ------------------------------------------------------------------------- - # 1. Intercept --step execution for sub-module isolation - # ------------------------------------------------------------------------- - if "--step" in sys.argv: - step_idx = sys.argv.index("--step") - if step_idx + 1 < len(sys.argv): - step = sys.argv[step_idx + 1] - if step in step_map: - script, base_args = step_map[step] - passed_args = sys.argv[1:] - passed_args.remove("--step") - passed_args.remove(step) - - cmd = _build_cmd(script, base_args, passed_args) - sys.exit(subprocess.run(cmd).returncode) - - # ------------------------------------------------------------------------- - # 2. Main Argument Parser Definition - # ------------------------------------------------------------------------- - parser = argparse.ArgumentParser( - description="KrakenParser: Convert Kraken2 Reports to CSV.", - formatter_class=argparse.RawTextHelpFormatter, - ) - - core_group = parser.add_argument_group("Core Arguments") - core_group.add_argument( - "-i", "--input", help="Directory containing Kraken2 report files" - ) - core_group.add_argument( - "-o", "--output", help="Output directory (default: parent of input)" - ) - core_group.add_argument( - "--viruses", - action="store_true", - help="Extract only VIRUSES domain taxa in the pipeline", - ) - core_group.add_argument( - "--keep-human", action="store_true", help="Do not filter human-related taxa" - ) - core_group.add_argument( - "-V", "--version", action="version", version=f"%(prog)s {__version__}" - ) - - pipe_group = parser.add_argument_group("Pipeline Options (Full Run)") - pipe_group.add_argument( - "-d", - "--depth", - type=int, - default=1000, - help="Rarefaction depth for β-diversity (default: 1000)", - ) - pipe_group.add_argument( - "-s", - "--seed", - type=int, - help="Random seed for reproducible rarefaction (default: random)", - ) - pipe_group.add_argument( +app = typer.Typer( + add_completion=False, + 
context_settings={"help_option_names": ["-h", "--help"]}, +) + +PANEL_NAME = "Advanced (Step-by-step pipeline control)" + +app.add_typer(mpa_app, name="mpa", rich_help_panel=PANEL_NAME) +app.add_typer(combine_app, name="combine", rich_help_panel=PANEL_NAME) +app.add_typer(split_app, name="split", rich_help_panel=PANEL_NAME) +app.add_typer(process_app, name="process", rich_help_panel=PANEL_NAME) +app.add_typer(csv_app, name="csv", rich_help_panel=PANEL_NAME) +app.add_typer(relabund_app, name="relabund", rich_help_panel=PANEL_NAME) +app.add_typer(diversity_app, name="diversity", rich_help_panel=PANEL_NAME) + + +def _version_callback(value: bool) -> None: + if value: + print(f"KrakenParser {__version__}") + raise typer.Exit() + + +@app.callback(invoke_without_command=True) +def main_callback( + ctx: typer.Context, + input_dir: Optional[Path] = typer.Option( + None, "-i", "--input", help="Directory containing Kraken2 report files." + ), + output_dir: Optional[Path] = typer.Option( + None, "-o", "--output", help="Output directory." + ), + viruses: bool = typer.Option( + False, "--viruses", help="Extract only VIRUSES domain taxa in the pipeline." + ), + bacteria: bool = typer.Option( + False, "--bacteria", help="Extract only BACTERIA domain taxa in the pipeline." + ), + fungi: bool = typer.Option( + False, "--fungi", help="Extract only FUNGI kingdom taxa in the pipeline." + ), + archaea: bool = typer.Option( + False, "--archaea", help="Extract only ARCHAEA domain taxa in the pipeline." + ), + keep_human: bool = typer.Option( + False, "--keep-human", help="Do not filter human-related taxa." + ), + version: Optional[bool] = typer.Option( + None, + "-V", + "--version", + callback=_version_callback, + is_eager=True, + help="Show version and exit.", + ), + depth: int = typer.Option( + 1000, "-d", "--depth", help="Rarefaction depth for β-diversity." + ), + seed: Optional[int] = typer.Option( + None, "-s", "--seed", help="Random seed for reproducible rarefaction." 
+ ), + overwrite: bool = typer.Option( + False, "--overwrite", - action="store_true", - help="Overwrite the output directory if it already exists", - ) - - adv_group = parser.add_argument_group("Advanced (Step-by-step control)") - adv_group.add_argument( - "--step", - choices=list(step_map.keys()), - help="Run only a specific part of the pipeline.\nType 'krakenparser --step -h' for more.", - ) - - # Suppressed routing flags for strict backwards compatibility - legacy_flags = [ - "--complete", - "--kreport2mpa", - "--combine_mpa", - "--deconstruct", - "--deconstruct_viruses", - "--process", - "--txt2csv", - "--relabund", - "--diversity", - ] - for flag in legacy_flags: - parser.add_argument(flag, action="store_true", help=argparse.SUPPRESS) - - # ------------------------------------------------------------------------- - # 3. Routing Logic and Validation - # ------------------------------------------------------------------------- - for _a in sys.argv: - if "\x00" in _a: - sys.exit("Error: argument contains invalid null byte.") - - args, unknown_args = parser.parse_known_args() - - legacy_map = { - "complete": (package_dir / "pipeline.py", []), - "kreport2mpa": step_map["mpa"], - "combine_mpa": step_map["combine"], - "deconstruct": step_map["split"], - "deconstruct_viruses": ( - package_dir / "counts" / "split_mpa.py", - ["--viruses-only"], - ), - "process": step_map["process"], - "txt2csv": step_map["csv"], - "relabund": step_map["relabund"], - "diversity": step_map["diversity"], - } - - passed_legacy_args = [ - arg - for arg in sys.argv[1:] - if not arg.startswith("--") or arg.lstrip("--") not in legacy_map - ] - - for flag, (script, base_args) in legacy_map.items(): - if getattr(args, flag, False): - cmd = _build_cmd(script, base_args, passed_legacy_args) - sys.exit(subprocess.run(cmd).returncode) - - # Standard entry point: trigger pipeline execution if input directory is provided - if args.input: - script = package_dir / "pipeline.py" - cmd = _build_cmd(script, 
[], sys.argv[1:]) - - in_path = Path(args.input) - out_path = Path(args.output) if args.output else in_path.parent + help="Overwrite the output directory if it already exists.", + ), +) -> None: + """ + KrakenParser: Convert Kraken2 Reports to CSV and analyze microbial diversity. + + To execute the full pipeline automatically, just use the global options. + + Alternatively, you can run specific parts of the pipeline manually in the following order: + + mpa ➔ combine ➔ split ➔ process ➔ csv ➔ relabund ➔ diversity + + Each step behaves as an independent tool. Type 'krakenparser <step> --help' to see options for a specific step. + """ + + if ctx.invoked_subcommand is not None: + return + + if input_dir: + print("KrakenParser by Ilia V. Popov") + + out_path = output_dir if output_dir else input_dir.parent out_path.mkdir(parents=True, exist_ok=True) log_file_path = out_path / "krakenparser.log" - with open(log_file_path, "w") as log_file: - result = subprocess.run(cmd, stdout=log_file, stderr=subprocess.STDOUT) + log_handler = logging.FileHandler(log_file_path, mode="w") + log_handler.setFormatter(logging.Formatter("%(message)s")) + logging.basicConfig(level=logging.INFO, handlers=[log_handler]) + + try: + run_pipeline( + input_dir=input_dir, + output_dir=output_dir, + keep_human=keep_human, + viruses_only=viruses, + bacteria_only=bacteria, + fungi_only=fungi, + archaea_only=archaea, + rarefaction_depth=depth, + seed=seed, + overwrite=overwrite, + ) + except (FileNotFoundError, FileExistsError) as e: + print(f"Error: {e}", file=sys.stderr) + raise typer.Exit(code=1) - if result.returncode == 0: - print("All steps completed successfully!") - print(f"Logs saved to {log_file_path}") + print("All steps completed successfully!") + print(f"Logs saved to {log_file_path}") - has_depth = any(arg in sys.argv for arg in ["-d", "--depth"]) - has_seed = any(arg in sys.argv for arg in ["-s", "--seed"]) + out_str = out_path.as_posix() - out_str = out_path.as_posix() + has_custom_depth = ( + 
str(ctx.get_parameter_source("depth")) != "ParameterSource.DEFAULT" + ) + has_custom_seed = ( + str(ctx.get_parameter_source("seed")) != "ParameterSource.DEFAULT" + ) - print("\n" + "=" * 95) + print("\n" + "=" * 95) - if not has_depth and not has_seed: - print( - f""" -[INFO] Pipeline completed using default rarefaction parameters (depth=1000, seed=random). + if not has_custom_depth and not has_custom_seed: + print( + f""" +[INFO] Pipeline completed using default rarefaction parameters (depth={depth}, seed=random). To calibrate beta-diversity sensitivity metrics for this specific dataset, manually execute the diversity sub-module with custom thresholds. Example: - krakenparser --step diversity \\ + krakenparser diversity \\ -i {out_str}/counts/counts_species.csv \\ -o {out_str}/diversity \\ --depth 1500 \\ - --seed 42 - """.rstrip() - ) + --seed 42""".rstrip() + ) - print( - f""" + print( + f""" [TIP] Downstream Data Visualization Prerequisite: Relative abundance normalization is required to group low-abundance taxa using the -O / --other parameter. Without filtering the 'long tail' of rare taxa, the resulting visualization will suffer from overplotting and significant loss of interpretability. Example: - krakenparser --step relabund \\ + krakenparser relabund \\ -i {out_str}/counts/counts_species.csv \\ -o {out_str}/rel_abund/counts_species_relabund_3_5.csv \\ -O 3.5 -{"=" * 95} - """.rstrip() - ) - else: - print(f"Pipeline failed. Check logs at {log_file_path}") +{"=" * 95}""".rstrip() + ) + + raise typer.Exit() + + print("KrakenParser by Ilia V. 
Popov") + print(ctx.get_help()) - sys.exit(result.returncode) - # Fallback to usage overview if no actionable arguments were provided - parser.print_help() +def entry_point() -> None: + try: + app() + except KeyboardInterrupt: + print("\nExecution interrupted by user.", file=sys.stderr) + sys.exit(1) if __name__ == "__main__": - main() + entry_point() From 83cd249c23a2c3b81fd0c5f068dc1b6a81ff6f9a Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 1 Jun 2026 20:33:21 +0200 Subject: [PATCH 10/13] chore(release): bump version to 1.1.0 and register Typer entry points --- pyproject.toml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0d777b1..8ca695b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "krakenparser" -version = "1.0.1" +version = "1.1.0" description = "A collection of scripts designed to process Kraken2 reports and convert them into CSV format." 
readme = {file = "README.md", content-type = "text/markdown"} license = {file = "LICENSE"} @@ -16,6 +16,7 @@ dependencies = [ "numpy>=1.24.0,<=2.3.5", "seaborn>=0.12.0,<=0.13.2", "scipy>=1.9.0,<=1.16.3", + "typer>=0.12.0,<1", ] [project.optional-dependencies] @@ -28,7 +29,8 @@ dev = [ Homepage = "https://github.com/PopovIILab/KrakenParser" [project.scripts] -KrakenParser = "krakenparser.krakenparser:main" +KrakenParser = "krakenparser.krakenparser:entry_point" +krakenparser = "krakenparser.krakenparser:entry_point" [tool.setuptools.packages.find] where = ["."] From d5279eb4072b2c9abbcf4d4bbb27b928f004811c Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 1 Jun 2026 20:36:05 +0200 Subject: [PATCH 11/13] docs(readme): polish structure and styling --- README.md | 349 +++++++++++++++++++++++++++--------------------------- 1 file changed, 177 insertions(+), 172 deletions(-) diff --git a/README.md b/README.md index d96b894..3681f86 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,9 @@ ## Overview + KrakenParser is a collection of scripts designed to process Kraken2 reports and convert them into CSV format. This pipeline extracts taxonomic abundance data at six levels: + - **Phylum** - **Class** - **Order** @@ -17,102 +19,25 @@ KrakenParser is a collection of scripts designed to process Kraken2 reports and - **Genus** - **Species** -You can run the entire pipeline with **a single command**, or use the scripts **individually** depending on your needs. 
- -🔗 Please visit [KrakenParser wiki](https://github.com/PopovIILab/KrakenParser/wiki) page - -## Output example - -### Total abundance output - -`counts_phylum.csv` parsed from 9 kraken2 reports of metagenomic samples using `KrakenParser`: - -``` -Sample_id,Calditrichota,Caldisericota,Thermosulfidibacterota,Elusimicrobiota,Candidatus Fervidibacterota,Lentisphaerota,Kiritimatiellota,Vulcanimicrobiota,Thermodesulfobiota,Atribacterota,Dictyoglomota,Nitrospinota,Chrysiogenota,Coprothermobacterota,Aquificota,Thermotogota,Bdellovibrionota,Nitrospirota,Deferribacterota,Synergistota,Myxococcota,Acidobacteriota,Candidatus Bipolaricaulota,Candidatus Saccharibacteria,Candidatus Absconditabacteria,Fusobacteriota,Spirochaetota,Candidatus Omnitrophota,Chlamydiota,Verrucomicrobiota,Planctomycetota,Thermodesulfobacteriota,Campylobacterota,Candidatus Cloacimonadota,Fibrobacterota,Gemmatimonadota,Balneolota,Rhodothermota,Ignavibacteriota,Chlorobiota,Bacteroidota,Deinococcota,Thermomicrobiota,Armatimonadota,Chloroflexota,Cyanobacteriota,Mycoplasmatota,Actinomycetota,Bacillota,Pseudomonadota,Heterolobosea,Parabasalia,Fornicata,Evosea,Bacillariophyta,Cercozoa,Euglenozoa,Apicomplexa,Microsporidia,Basidiomycota,Ascomycota,Nanoarchaeota,Candidatus Micrarchaeota,Candidatus Thermoplasmatota,Candidatus Lokiarchaeota,Nitrososphaerota,Euryarchaeota,Thermoproteota,Hofneiviricota,Artverviricota,Nucleocytoviricota,Cossaviricota,Kitrinoviricota,Negarnaviricota,Lenarviricota,Pisuviricota,Peploviricota,Uroviricota -X1,0,0,0,0,0,0,0,0,1,1,1,1,2,3,4,5,7,8,9,17,23,25,5,13,22,47,54,1,6,27,31,128,151,2,6,13,1,3,7,44,14991,7,9,11,61,414,449,3551,55304,438645,0,0,0,0,0,0,1,22,0,4,15,0,0,0,0,0,3,191,0,0,1,88,0,0,0,161,0,1241 -X2,1,4,14,20,5,12,15,6,8,15,2,15,109,68,182,97,79,196,70,272,331,149,36,77,35,562,1237,21,33,129,427,1044,543,8,98,25,16,45,11,1043,41374,160,28,161,1348,1196,2709,15864,431170,2747842,22,7,301,373,134,136,107,3239,54,1151,2905,0,0,3,5,6,7,410,0,0,0,736,0,3,11,26,1,1552 -... 
-X8,1,19,0,47,0,1,6,20,28,0,1,1,47,7,336,110,30,32,10,93,85,48,9,7,7,154,386,0,14,19,106,358,242,14,5,134,15,11,7,18,54057,106,10,24,212,340,1128,16220,567908,650264,95,4,193,402,314,300,187,4376,37,9796,8653,0,1,0,1,5,23,1778,1,1,0,1,1,4,66,30,4,1263 -X9,0,3,2,16,7,1,23,12,10,9,1,2,134,40,390,289,29,372,27,81,150,90,9,88,32,287,881,14,33,60,319,1045,328,15,22,22,10,72,8,63,35301,127,15,48,412,935,2343,11500,380765,2613854,0,0,0,0,0,0,5,74,0,38,40,3,0,0,0,1,3,275,0,0,0,0,0,2,118,25,0,1675 - -``` - -### Relative abundance output - -`ra_phylum.csv` calculated from 9 kraken2 reports of metagenomic samples using `KrakenParser`: - -``` -Sample_id,taxon,rel_abund_perc -X1,Pseudomonadota,85.03558294577552 -X1,Bacillota,10.72121619814011 -X1,Other (<4.0%),4.243200856084384 -X2,Pseudomonadota,84.28702055549813 -X2,Bacillota,13.225663867469137 -X2,Other (<4.0%),2.487315577032736 -... -X8,Pseudomonadota,49.25373021277305 -X8,Bacillota,43.01574040339849 -X8,Bacteroidota,4.094504530639667 -X8,Other (<4.0%),3.6360248531887933 -X9,Pseudomonadota,85.62839981589192 -X9,Bacillota,12.473649123439218 -X9,Other (<4.0%),1.8979510606688494 -``` - -### α-diversity output - -`alpha_div.csv` calculated from 9 kraken2 reports of metagenomic samples using `KrakenParser`: - -``` -Sample,Shannon,Pielou,Chao1 -X1,3.911345447107001,0.5269245043289149,2274.533185840708 -X2,3.9944130792536563,0.4906424221265042,4155.0 -... -X8,3.442077115880119,0.42753293021330063,4177.251358695652 -X9,4.033664950188261,0.5050385978575492,3492.16 -``` - -### β-diversity output - -`beta_div_bray.csv` calculated from 9 kraken2 reports of metagenomic samples using `KrakenParser`: - -``` -,X1,X2,...,X8,X9 -X1,0.0,0.398,...,0.61,0.353 -X2,0.398,0.0,...,0.723,0.388 -... 
-X8,0.61,0.723,...,0.0,0.665 -X9,0.353,0.388,...,0.665,0.0 -``` - -`beta_div_jaccard.csv` calculated from 9 kraken2 reports of metagenomic samples using `KrakenParser`: +## Installation +```bash +# Linux / WSL / macOS +conda create -n krakenparser pip -y +conda activate krakenparser +pip install krakenparser ``` -,X1,X2,...,X8,X9 -X1,0.0,0.7073170731707317,...,0.8223938223938224,0.7232472324723247 -X2,0.7073170731707317,0.0,...,0.835016835016835,0.7352941176470589 -... -X8,0.8223938223938224,0.835016835016835,...,0.0,0.8066914498141264 -X9,0.7232472324723247,0.7352941176470589,...,0.8066914498141264,0.0 -``` - -### Visualization examples gallery - -|[Stacked Barplot](https://github.com/PopovIILab/KrakenParser/wiki/Stacked-Barplot-API)|[Streamgraph](https://github.com/PopovIILab/KrakenParser/wiki/Streamgraph-API)| -|-------|-------| -|![kpstbar](https://github.com/user-attachments/assets/916b0164-28be-4f49-9634-707408487b85)|![kpstream](https://github.com/user-attachments/assets/8fc09fdb-e397-4c39-9290-ad11da5335a8)| -[Stacked Barplot + Streamgraph](https://github.com/PopovIILab/KrakenParser/wiki/Combined-Stacked-Barplot-&-Streamgraph)|[Clustermap](https://github.com/PopovIILab/KrakenParser/wiki/Clustermap)| -|-------|-------| -|![combined_white](https://github.com/user-attachments/assets/48b3f6e3-6dd5-4298-a793-23dcd549e90c)|![kpclust](https://github.com/user-attachments/assets/98a4d540-7c43-4802-8f77-277a5637a7a1)| +## Usage Guide -## Quick Start (Full Pipeline) +### Full Pipeline ```bash KrakenParser -i data/kreports -o results/ ``` This will: + 1. Convert Kraken2 reports to MPA format 2. Combine MPA files into a single file 3. Extract taxonomic levels into separate text files @@ -126,170 +51,249 @@ This will: > rarefaction depth for β-diversity and re-running relative abundance normalization > before visualization — with ready-to-paste example commands tailored to your output paths. 
-### Full help output - -``` -usage: KrakenParser [-h] [-i INPUT] [-o OUTPUT] [--viruses] [--keep-human] - [-V] [-d DEPTH] [-s SEED] [--overwrite] - [--step {mpa,combine,split,process,csv,relabund,diversity}] - -KrakenParser: Convert Kraken2 Reports to CSV. - -options: - -h, --help show this help message and exit - -Core Arguments: - -i, --input INPUT Directory containing Kraken2 report files - -o, --output OUTPUT Output directory (default: parent of input) - --viruses Extract only VIRUSES domain taxa in the pipeline - --keep-human Do not filter human-related taxa - -V, --version show program's version number and exit - -Pipeline Options (Full Run): - -d, --depth DEPTH Rarefaction depth for β-diversity (default: 1000) - -s, --seed SEED Random seed for reproducible rarefaction (default: random) - --overwrite Overwrite the output directory if it already exists - -Advanced (Step-by-step control): - --step {mpa,combine,split,process,csv,relabund,diversity} - Run only a specific part of the pipeline. - Type 'krakenparser --step -h' for more. +#### Full help output + +```text +Usage: KrakenParser [OPTIONS] COMMAND [ARGS]... + + KrakenParser: Convert Kraken2 Reports to CSV and analyze microbial diversity. + + To execute the full pipeline automatically, just use the global options. + + Alternatively, you can run specific parts of the pipeline manually in the + following order: + + mpa ➔ combine ➔ split ➔ process ➔ csv ➔ relabund ➔ diversity + + Each step behaves as an independent tool. Type 'krakenparser <step> --help' + to see options for a specific step. + +╭─ Options ────────────────────────────────────────────────────────────────────╮ +│ --input -i PATH Directory containing Kraken2 report files. │ +│ --output -o PATH Output directory. │ +│ --viruses Extract only VIRUSES domain taxa in the │ +│ pipeline. │ +│ --bacteria Extract only BACTERIA domain taxa in the │ +│ pipeline. │ +│ --fungi Extract only FUNGI kingdom taxa in the │ +│ pipeline. 
│ +│ --archaea Extract only ARCHAEA domain taxa in the │ +│ pipeline. │ +│ --keep-human Do not filter human-related taxa. │ +│ --version -V Show version and exit. │ +│ --depth -d INTEGER Rarefaction depth for β-diversity. │ +│ [default: 1000] │ +│ --seed -s INTEGER Random seed for reproducible rarefaction. │ +│ --overwrite Overwrite the output directory if it already │ +│ exists. │ +│ --help -h Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Advanced (Step-by-step pipeline control) ───────────────────────────────────╮ +│ mpa Convert a Kraken2 report to MetaPhlAn (MPA) format. │ +│ combine Combine MPA files into a single tab-delimited table. │ +│ split Split a combined MPA table into per-rank TXT files. │ +│ process Reads a source file, processes its first line, modifies taxa │ +│ names in a destination file, and updates it. │ +│ csv Reads a TXT file, reorganizes the data, and converts it into a │ +│ CSV file. │ +│ relabund Calculates taxa relative abundance and saves it to a CSV file. │ +│ diversity Calculate α & β-diversities for microbial communities. │ +╰──────────────────────────────────────────────────────────────────────────────╯ ``` -## Installation - -``` -pip install krakenparser -``` +🔗 Please visit [KrakenParser wiki](https://github.com/PopovIILab/KrakenParser/wiki) page ---- +### Advanced step-by-step mode
-Using Individual Modules (Advanced) +Advanced usage
-Each step of the pipeline can be run individually via `--step`. This is useful for re-running a single step, debugging, or integrating KrakenParser into a custom workflow. Run `krakenparser --step -h` to see the full argument list for any step. +Each step behaves as an independent tool. Type `krakenparser <step> --help` to see options for a specific step. ### **Step 1: Convert Kraken2 Reports to MPA Format** + ```bash # Batch mode (directory) -KrakenParser --step mpa -i data/kreports -o data/intermediate/mpa +KrakenParser mpa -i data/kreports -o data/intermediate/mpa # Single file -KrakenParser --step mpa -r data/kreports/sample.kreport -o data/intermediate/mpa/sample.MPA.TXT +KrakenParser mpa -r data/kreports/sample.kreport -o data/intermediate/mpa/sample.MPA.TXT ``` + Converts Kraken2 `.kreport` files into **MPA format**. ### **Step 2: Combine MPA Files** + ```bash -KrakenParser --step combine -i data/intermediate/mpa/* -o data/intermediate/COMBINED.txt +KrakenParser combine -i data/intermediate/mpa/* -o data/intermediate/COMBINED.txt ``` + Merges multiple MPA files into a single combined table. ### **Step 3: Extract Taxonomic Levels** + ```bash -KrakenParser --step split -i data/intermediate/COMBINED.txt -o data/intermediate +KrakenParser split -i data/intermediate/COMBINED.txt -o data/intermediate ``` By default, human-related taxa (Homo sapiens, Hominidae, Primates, Mammalia, Chordata) are removed. 
To keep them: + ```bash -KrakenParser --step split -i data/intermediate/COMBINED.txt -o data/intermediate --keep-human +KrakenParser split -i data/intermediate/COMBINED.txt -o data/intermediate --keep-human ``` To inspect the **Viruses** domain only: + ```bash -KrakenParser --step split -i data/intermediate/COMBINED.txt -o data/counts_viruses --viruses-only +KrakenParser split -i data/intermediate/COMBINED.txt -o data/counts_viruses --viruses-only ``` +The same applies to the Bacteria and Archaea domains and the Fungi kingdom (`--bacteria-only`, `--archaea-only`, `--fungi-only`). + ### **Step 4: Process Extracted Taxonomic Data** + ```bash -KrakenParser --step process -i data/intermediate/COMBINED.txt -o data/intermediate/txt/counts_phylum.txt +KrakenParser process -i data/intermediate/COMBINED.txt -o data/intermediate/txt/counts_phylum.txt ``` -Repeat on other 5 taxonomical levels (class, order, family, genus, species) or wrap `--step process` in a loop. +Repeat on other 5 taxonomical levels (class, order, family, genus, species) or wrap `process` in a loop. Cleans up taxonomic names: removes prefixes (`s__`, `g__`, etc.) and replaces underscores with spaces. ### **Step 5: Convert TXT to CSV** + ```bash -KrakenParser --step csv -i data/intermediate/txt/counts_phylum.txt -o data/counts/counts_phylum.csv +KrakenParser csv -i data/intermediate/txt/counts_phylum.txt -o data/counts/counts_phylum.csv ``` + Repeat on other 5 taxonomical levels or wrap in a loop. Transposes data so that sample names become rows. ### **Step 6: Calculate Relative Abundance** + ```bash -KrakenParser --step relabund -i data/counts/counts_phylum.csv -o data/rel_abund/ra_phylum.csv +KrakenParser relabund -i data/counts/counts_phylum.csv -o data/rel_abund/ra_phylum.csv ``` + Repeat on other 5 taxonomical levels or wrap in a loop. 
With "Other" grouping: + ```bash -KrakenParser --step relabund -i data/counts/counts_phylum.csv -o data/rel_abund/ra_phylum.csv -O 3.5 +KrakenParser relabund -i data/counts/counts_phylum.csv -o data/rel_abund/ra_phylum.csv -O 3.5 ``` + Groups all taxa with abundance < 3.5 % into `Other (<3.5%)`. ### **Step 7: Calculate α & β-Diversities** + ```bash -KrakenParser --step diversity -i data/counts/counts_species.csv -o data/diversity +KrakenParser diversity -i data/counts/counts_species.csv -o data/diversity ``` With a custom rarefaction depth: + ```bash -KrakenParser --step diversity -i data/counts/counts_species.csv -o data/diversity -d 750 +KrakenParser diversity -i data/counts/counts_species.csv -o data/diversity -d 750 ``` For reproducible results (fix the seed to get the same matrix every run): + ```bash -KrakenParser --step diversity -i data/counts/counts_species.csv -o data/diversity -s 42 +KrakenParser diversity -i data/counts/counts_species.csv -o data/diversity -s 42 +``` + +
+ +## Output example + +### Total abundance output + +`counts_phylum.csv` parsed from 9 kraken2 reports of metagenomic samples using `KrakenParser`: + +```text +Sample_id,Calditrichota,Caldisericota,Thermosulfidibacterota,Elusimicrobiota,Candidatus Fervidibacterota,Lentisphaerota,Kiritimatiellota,Vulcanimicrobiota,Thermodesulfobiota,Atribacterota,Dictyoglomota,Nitrospinota,Chrysiogenota,Coprothermobacterota,Aquificota,Thermotogota,Bdellovibrionota,Nitrospirota,Deferribacterota,Synergistota,Myxococcota,Acidobacteriota,Candidatus Bipolaricaulota,Candidatus Saccharibacteria,Candidatus Absconditabacteria,Fusobacteriota,Spirochaetota,Candidatus Omnitrophota,Chlamydiota,Verrucomicrobiota,Planctomycetota,Thermodesulfobacteriota,Campylobacterota,Candidatus Cloacimonadota,Fibrobacterota,Gemmatimonadota,Balneolota,Rhodothermota,Ignavibacteriota,Chlorobiota,Bacteroidota,Deinococcota,Thermomicrobiota,Armatimonadota,Chloroflexota,Cyanobacteriota,Mycoplasmatota,Actinomycetota,Bacillota,Pseudomonadota,Heterolobosea,Parabasalia,Fornicata,Evosea,Bacillariophyta,Cercozoa,Euglenozoa,Apicomplexa,Microsporidia,Basidiomycota,Ascomycota,Nanoarchaeota,Candidatus Micrarchaeota,Candidatus Thermoplasmatota,Candidatus Lokiarchaeota,Nitrososphaerota,Euryarchaeota,Thermoproteota,Hofneiviricota,Artverviricota,Nucleocytoviricota,Cossaviricota,Kitrinoviricota,Negarnaviricota,Lenarviricota,Pisuviricota,Peploviricota,Uroviricota +X1,0,0,0,0,0,0,0,0,1,1,1,1,2,3,4,5,7,8,9,17,23,25,5,13,22,47,54,1,6,27,31,128,151,2,6,13,1,3,7,44,14991,7,9,11,61,414,449,3551,55304,438645,0,0,0,0,0,0,1,22,0,4,15,0,0,0,0,0,3,191,0,0,1,88,0,0,0,161,0,1241 +X2,1,4,14,20,5,12,15,6,8,15,2,15,109,68,182,97,79,196,70,272,331,149,36,77,35,562,1237,21,33,129,427,1044,543,8,98,25,16,45,11,1043,41374,160,28,161,1348,1196,2709,15864,431170,2747842,22,7,301,373,134,136,107,3239,54,1151,2905,0,0,3,5,6,7,410,0,0,0,736,0,3,11,26,1,1552 +... 
+X8,1,19,0,47,0,1,6,20,28,0,1,1,47,7,336,110,30,32,10,93,85,48,9,7,7,154,386,0,14,19,106,358,242,14,5,134,15,11,7,18,54057,106,10,24,212,340,1128,16220,567908,650264,95,4,193,402,314,300,187,4376,37,9796,8653,0,1,0,1,5,23,1778,1,1,0,1,1,4,66,30,4,1263 +X9,0,3,2,16,7,1,23,12,10,9,1,2,134,40,390,289,29,372,27,81,150,90,9,88,32,287,881,14,33,60,319,1045,328,15,22,22,10,72,8,63,35301,127,15,48,412,935,2343,11500,380765,2613854,0,0,0,0,0,0,5,74,0,38,40,3,0,0,0,1,3,275,0,0,0,0,0,2,118,25,0,1675 + +``` + +### Relative abundance output + +`ra_phylum.csv` calculated from 9 kraken2 reports of metagenomic samples using `KrakenParser`: + +```text +Sample_id,taxon,rel_abund_perc +X1,Pseudomonadota,85.03558294577552 +X1,Bacillota,10.72121619814011 +X1,Other (<4.0%),4.243200856084384 +X2,Pseudomonadota,84.28702055549813 +X2,Bacillota,13.225663867469137 +X2,Other (<4.0%),2.487315577032736 +... +X8,Pseudomonadota,49.25373021277305 +X8,Bacillota,43.01574040339849 +X8,Bacteroidota,4.094504530639667 +X8,Other (<4.0%),3.6360248531887933 +X9,Pseudomonadota,85.62839981589192 +X9,Bacillota,12.473649123439218 +X9,Other (<4.0%),1.8979510606688494 ``` ---- +### α-diversity output -## Arguments Breakdown +`alpha_div.csv` calculated from 9 kraken2 reports of metagenomic samples using `KrakenParser`: -### **Full Pipeline** (`-i`) -- `-i / --input`: path to the Kraken2 reports directory (e.g., `data/kreports`). Triggers the full pipeline. -- `-o / --output`: output directory (default: parent of `-i`). -- `--viruses`: extract only Viruses domain taxa throughout the pipeline. -- `--keep-human`: retain human-related taxa (default: filtered out). -- `-d INT / --depth`: rarefaction depth for β-diversity (default: 1000). -- `-s INT / --seed`: random seed for reproducible β-diversity rarefaction (default: random). -- `--overwrite`: overwrite the output directory if it already exists. 
+```text +Sample,Shannon,Pielou,Chao1 +X1,3.911345447107001,0.5269245043289149,2274.533185840708 +X2,3.9944130792536563,0.4906424221265042,4155.0 +... +X8,3.442077115880119,0.42753293021330063,4177.251358695652 +X9,4.033664950188261,0.5050385978575492,3492.16 +``` -### **--step mpa** (Step 1) -- Batch mode: `-i DIR -o DIR` — converts all files in a directory. -- Single-file mode: `-r FILE -o FILE`. +### β-diversity output -### **--step combine** (Step 2) -- `-i FILE [FILE ...]`: one or more MPA files. -- `-o FILE`: output merged table. +`beta_div_bray.csv` calculated from 9 kraken2 reports of metagenomic samples using `KrakenParser`: -### **--step split** (Step 3) -- Extracts **phylum, class, order, family, genus, species** into separate text files. -- Removes human-related reads by default; use `--keep-human` to retain them. -- Use `--viruses-only` to extract only the Viruses domain. +```text +,X1,X2,...,X8,X9 +X1,0.0,0.398,...,0.61,0.353 +X2,0.398,0.0,...,0.723,0.388 +... +X8,0.61,0.723,...,0.0,0.665 +X9,0.353,0.388,...,0.665,0.0 +``` -### **--step process** (Step 4) -- Removes prefixes (`s__`, `g__`, etc.), replaces underscores with spaces. -- `-i`: COMBINED.txt (source for sample-name header); `-o`: target txt file. +`beta_div_jaccard.csv` calculated from 9 kraken2 reports of metagenomic samples using `KrakenParser`: -### **--step csv** (Step 5) -- Transposes a processed txt file into a CSV with sample names as rows. +```text +,X1,X2,...,X8,X9 +X1,0.0,0.7073170731707317,...,0.8223938223938224,0.7232472324723247 +X2,0.7073170731707317,0.0,...,0.835016835016835,0.7352941176470589 +... +X8,0.8223938223938224,0.835016835016835,...,0.0,0.8066914498141264 +X9,0.7232472324723247,0.7352941176470589,...,0.8066914498141264,0.0 +``` -### **--step relabund** (Step 6) -- Calculates relative abundance from a total-counts CSV. 
-- `-O FLOAT`: group taxa below FLOAT % into `Other (<FLOAT%)`. +[Stacked Barplot + Streamgraph](https://github.com/PopovIILab/KrakenParser/wiki/Combined-Stacked-Barplot-&-Streamgraph)|[Clustermap](https://github.com/PopovIILab/KrakenParser/wiki/Clustermap)| +|-------|-------| +|![combined_white](https://github.com/user-attachments/assets/48b3f6e3-6dd5-4298-a793-23dcd549e90c)|![kpclust](https://github.com/user-attachments/assets/98a4d540-7c43-4802-8f77-277a5637a7a1)| ## Example Output Structure + After running the full pipeline, the output directory will look like this: -``` + +```text results/ ├─ counts/ # Total abundance CSV output │ ├─ counts_species.csv @@ -319,7 +323,8 @@ results/ ``` ## Conclusion -KrakenParser provides a **simple and automated** way to convert Kraken2 reports into usable CSV files for downstream analysis. You can run the **full pipeline** with a single command or use **individual scripts** as needed. + +KrakenParser provides a **simple and automated** way to convert Kraken2 reports into usable CSV files for downstream analysis. You can run the **full pipeline** with a single command or use **individual modules** as needed. For any issues or feature requests, feel free to open an issue on GitHub! 
From 58f6bb4069164ef5eedb39b9302b88bf7c717794 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 1 Jun 2026 20:38:58 +0200 Subject: [PATCH 12/13] test(cli): update test suites to reflect modern typer subcommand syntax --- tests/test_cli.py | 249 ++++++++++++++++++++---------------- tests/test_full_pipeline.py | 10 +- tests/test_integration.py | 74 +++++------ 3 files changed, 182 insertions(+), 151 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 9badb19..f3def0e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,20 +1,19 @@ -"""Smoke tests for CLI entry-points (main() functions via sys.argv monkeypatching).""" +"""Smoke tests for CLI entry-points via Typer CliRunner.""" import shutil -import sys -import warnings import pandas as pd import pytest +from typer.testing import CliRunner -from krakenparser.counts.convert2csv import main as convert2csv_main -from krakenparser.counts.processing_script import main as processing_main -from krakenparser.counts.split_mpa import main as split_mpa_main -from krakenparser.mpa.mpa_table import main as mpa_table_main -from krakenparser.mpa.transform2mpa import main as transform2mpa_main -from krakenparser.pipeline import main as pipeline_main -from krakenparser.stats.diversity import main as diversity_main -from krakenparser.stats.relabund import main as relabund_main +from krakenparser.counts.convert2csv import app as convert2csv_app +from krakenparser.counts.processing_script import app as processing_app +from krakenparser.counts.split_mpa import app as split_mpa_app +from krakenparser.mpa.mpa_table import app as mpa_table_app +from krakenparser.mpa.transform2mpa import app as transform2mpa_app +from krakenparser.pipeline import app as pipeline_app +from krakenparser.stats.diversity import app as diversity_app +from krakenparser.stats.relabund import app as relabund_app _MPA_A = "#Classification\tsample1\nd__Bacteria|s__Pseudomonas_aeruginosa\t300\n" _MPA_B = 
"#Classification\tsample2\nd__Bacteria|s__Pseudomonas_aeruginosa\t100\n" @@ -26,18 +25,35 @@ ) +@pytest.fixture +def runner(): + return CliRunner() + + # --------------------------------------------------------------------------- # convert2csv # --------------------------------------------------------------------------- -def test_convert2csv_main(counts_txt_file, tmp_path, monkeypatch): - out = tmp_path / "out.csv" - monkeypatch.setattr( - sys, "argv", ["c2c", "-i", str(counts_txt_file), "-o", str(out)] +def test_convert2csv_no_args_shows_help(runner): + result = runner.invoke(convert2csv_app, []) + assert result.exit_code == 0 + assert "Usage" in result.output + + +def test_convert2csv_missing_one_option(runner, tmp_path): + result = runner.invoke(convert2csv_app, ["-i", str(tmp_path / "x.txt")]) + assert result.exit_code == 1 + assert "Missing required options" in result.output + + +def test_convert2csv_file_not_found(runner, tmp_path): + result = runner.invoke( + convert2csv_app, + ["-i", str(tmp_path / "ghost.txt"), "-o", str(tmp_path / "out.csv")], ) - convert2csv_main() - assert out.exists() + assert result.exit_code == 1 + assert "Error" in result.output # --------------------------------------------------------------------------- @@ -45,13 +61,25 @@ def test_convert2csv_main(counts_txt_file, tmp_path, monkeypatch): # --------------------------------------------------------------------------- -def test_processing_main(tmp_path, monkeypatch): - source = tmp_path / "COMBINED.txt" - source.write_text("#Classification\tsample1.kreport\n") - dest = tmp_path / "counts.txt" - dest.write_text("s__Pseudomonas_aeruginosa\t100\n") - monkeypatch.setattr(sys, "argv", ["ps", "-i", str(source), "-o", str(dest)]) - processing_main() +def test_processing_no_args_shows_help(runner): + result = runner.invoke(processing_app, []) + assert result.exit_code == 0 + assert "Usage" in result.output + + +def test_processing_missing_one_option(runner, tmp_path): + result = 
runner.invoke(processing_app, ["-i", str(tmp_path / "x.txt")]) + assert result.exit_code == 1 + assert "Missing required options" in result.output + + +def test_processing_file_not_found(runner, tmp_path): + result = runner.invoke( + processing_app, + ["-i", str(tmp_path / "ghost.txt"), "-o", str(tmp_path / "dest.txt")], + ) + assert result.exit_code == 1 + assert "Error" in result.output # --------------------------------------------------------------------------- @@ -59,33 +87,25 @@ def test_processing_main(tmp_path, monkeypatch): # --------------------------------------------------------------------------- -def test_split_mpa_main(tmp_path, monkeypatch): - combined = tmp_path / "COMBINED.txt" - combined.write_text(_COMBINED_MPA) - out = tmp_path / "out" - monkeypatch.setattr(sys, "argv", ["sm", "-i", str(combined), "-o", str(out)]) - split_mpa_main() - assert (out / "txt" / "counts_species.txt").exists() +def test_split_mpa_no_args_shows_help(runner): + result = runner.invoke(split_mpa_app, []) + assert result.exit_code == 0 + assert "Usage" in result.output -def test_split_mpa_main_viruses_only(tmp_path, monkeypatch): - combined = tmp_path / "COMBINED.txt" - combined.write_text(_COMBINED_MPA + "d__Viruses|s__Virus_X\t5\t3\n") - out = tmp_path / "out" - monkeypatch.setattr( - sys, "argv", ["sm", "-i", str(combined), "-o", str(out), "--viruses-only"] - ) - split_mpa_main() +def test_split_mpa_missing_one_option(runner, tmp_path): + result = runner.invoke(split_mpa_app, ["-i", str(tmp_path / "x.txt")]) + assert result.exit_code == 1 + assert "Missing required options" in result.output -def test_split_mpa_main_keep_human(tmp_path, monkeypatch): - combined = tmp_path / "COMBINED.txt" - combined.write_text(_COMBINED_MPA) - out = tmp_path / "out" - monkeypatch.setattr( - sys, "argv", ["sm", "-i", str(combined), "-o", str(out), "--keep-human"] +def test_split_mpa_file_not_found(runner, tmp_path): + result = runner.invoke( + split_mpa_app, + ["-i", str(tmp_path / 
"ghost.txt"), "-o", str(tmp_path / "out")], ) - split_mpa_main() + assert result.exit_code == 1 + assert "Error" in result.output # --------------------------------------------------------------------------- @@ -93,13 +113,14 @@ def test_split_mpa_main_keep_human(tmp_path, monkeypatch): # --------------------------------------------------------------------------- -def test_mpa_table_main(tmp_path, monkeypatch): +def test_mpa_table_main(tmp_path, runner): a, b = tmp_path / "a.MPA.TXT", tmp_path / "b.MPA.TXT" a.write_text(_MPA_A) b.write_text(_MPA_B) out = tmp_path / "COMBINED.txt" - monkeypatch.setattr(sys, "argv", ["mt", "-i", str(a), str(b), "-o", str(out)]) - mpa_table_main() + + result = runner.invoke(mpa_table_app, ["-i", str(a), "-i", str(b), "-o", str(out)]) + assert result.exit_code == 0 assert out.exists() @@ -108,22 +129,23 @@ def test_mpa_table_main(tmp_path, monkeypatch): # --------------------------------------------------------------------------- -def test_transform2mpa_main_single(kreport_file, tmp_path, monkeypatch): +def test_transform2mpa_main_single(kreport_file, tmp_path, runner): out = tmp_path / "out.MPA.TXT" - monkeypatch.setattr(sys, "argv", ["t2m", "-r", str(kreport_file), "-o", str(out)]) - transform2mpa_main() + result = runner.invoke(transform2mpa_app, ["-r", str(kreport_file), "-o", str(out)]) + assert result.exit_code == 0 assert out.exists() -def test_transform2mpa_main_batch(kreport_file, tmp_path, monkeypatch): +def test_transform2mpa_main_batch(kreport_file, tmp_path, runner): kreports_dir = tmp_path / "kreports" kreports_dir.mkdir() shutil.copy(kreport_file, kreports_dir / kreport_file.name) out_dir = tmp_path / "mpa_out" - monkeypatch.setattr( - sys, "argv", ["t2m", "-i", str(kreports_dir), "-o", str(out_dir)] + + result = runner.invoke( + transform2mpa_app, ["-i", str(kreports_dir), "-o", str(out_dir)] ) - transform2mpa_main() + assert result.exit_code == 0 assert out_dir.is_dir() @@ -132,35 +154,37 @@ def 
test_transform2mpa_main_batch(kreport_file, tmp_path, monkeypatch): # --------------------------------------------------------------------------- -def test_diversity_main_with_seed(counts_csv_file, tmp_path, monkeypatch): - out_dir = tmp_path / "div" - monkeypatch.setattr( - sys, - "argv", - [ - "div", - "-i", - str(counts_csv_file), - "-o", - str(out_dir), - "-d", - "1000", - "-s", - "42", - ], +def test_diversity_no_args_shows_help(runner): + result = runner.invoke(diversity_app, []) + assert result.exit_code == 0 + assert "Usage" in result.output + + +def test_diversity_missing_one_option(runner, tmp_path): + result = runner.invoke(diversity_app, ["-o", str(tmp_path / "out")]) + assert result.exit_code == 1 + assert "Missing required options" in result.output + + +def test_diversity_file_not_found(runner, tmp_path): + result = runner.invoke( + diversity_app, + ["-i", str(tmp_path / "ghost.csv"), "-o", str(tmp_path / "out")], ) - diversity_main() - assert (out_dir / "alpha_div.csv").exists() + assert result.exit_code == 1 + assert "Error" in result.output -def test_diversity_main_no_seed(counts_csv_file, tmp_path, monkeypatch): +def test_diversity_not_enough_samples_for_beta(runner, tmp_path): + csv_in = tmp_path / "single.csv" + pd.DataFrame({"Taxon_A": [100], "Taxon_B": [200]}, index=["S1"]).to_csv(csv_in) out_dir = tmp_path / "div" - monkeypatch.setattr( - sys, - "argv", - ["div", "-i", str(counts_csv_file), "-o", str(out_dir), "-d", "1000"], + result = runner.invoke( + diversity_app, + ["-i", str(csv_in), "-o", str(out_dir), "-d", "50"], ) - diversity_main() + assert result.exit_code == 1 + assert "Error" in result.output # --------------------------------------------------------------------------- @@ -168,34 +192,25 @@ def test_diversity_main_no_seed(counts_csv_file, tmp_path, monkeypatch): # --------------------------------------------------------------------------- -def test_relabund_main(counts_csv_file, tmp_path, monkeypatch): - out = tmp_path / "ra.csv" 
- monkeypatch.setattr(sys, "argv", ["ra", "-i", str(counts_csv_file), "-o", str(out)]) - relabund_main() - assert out.exists() +def test_relabund_no_args_shows_help(runner): + result = runner.invoke(relabund_app, []) + assert result.exit_code == 0 + assert "Usage" in result.output -def test_relabund_main_with_other_threshold(counts_csv_file, tmp_path, monkeypatch): - out = tmp_path / "ra.csv" - monkeypatch.setattr( - sys, "argv", ["ra", "-i", str(counts_csv_file), "-o", str(out), "-O", "50"] - ) - relabund_main() +def test_relabund_missing_one_option(runner, tmp_path): + result = runner.invoke(relabund_app, ["-i", str(tmp_path / "x.csv")]) + assert result.exit_code == 1 + assert "Missing required options" in result.output -def test_relabund_warns_zero_abundance_sample(tmp_path): - df = pd.DataFrame( - {"Sample_id": ["S1", "S2"], "Taxon_A": [0, 100], "Taxon_B": [0, 200]} +def test_relabund_file_not_found(runner, tmp_path): + result = runner.invoke( + relabund_app, + ["-i", str(tmp_path / "ghost.csv"), "-o", str(tmp_path / "out.csv")], ) - csv_in = tmp_path / "counts.csv" - df.to_csv(csv_in, index=False) - out = tmp_path / "ra.csv" - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - from krakenparser.stats.relabund import calculate_rel_abund - - calculate_rel_abund(str(csv_in), str(out)) - assert any("zero total abundance" in str(w.message) for w in caught) + assert result.exit_code == 1 + assert "Error" in result.output # --------------------------------------------------------------------------- @@ -203,7 +218,27 @@ def test_relabund_warns_zero_abundance_sample(tmp_path): # --------------------------------------------------------------------------- -def test_pipeline_main_missing_input_exits(tmp_path, monkeypatch): - monkeypatch.setattr(sys, "argv", ["pipeline", "-i", str(tmp_path / "ghost")]) - with pytest.raises(SystemExit): - pipeline_main() +def test_pipeline_no_mpa_files(runner, tmp_path): + empty_dir = tmp_path / 
"kreports" + empty_dir.mkdir() + result = runner.invoke(pipeline_app, ["-i", str(empty_dir)]) + assert result.exit_code == 1 + assert "Error" in result.output + + +def test_pipeline_file_exists_error(runner, tmp_path, kreport_file): + kreports_dir = tmp_path / "kreports" + kreports_dir.mkdir() + shutil.copy(kreport_file, kreports_dir / kreport_file.name) + + runner.invoke(pipeline_app, ["-i", str(kreports_dir), "--overwrite"]) + + result = runner.invoke(pipeline_app, ["-i", str(kreports_dir)]) + assert result.exit_code == 1 + assert "Error" in result.output + + +def test_pipeline_missing_input_dir(runner, tmp_path): + result = runner.invoke(pipeline_app, ["-i", str(tmp_path / "ghost")]) + assert result.exit_code == 1 + assert "Error" in result.output diff --git a/tests/test_full_pipeline.py b/tests/test_full_pipeline.py index 72df80f..ad2dba6 100644 --- a/tests/test_full_pipeline.py +++ b/tests/test_full_pipeline.py @@ -31,7 +31,7 @@ def test_full_pipeline_end_to_end(demo_run): run_dir = demo_run["run_dir"] kreports_path = run_dir / "kreports" - run_pipeline(str(kreports_path)) + run_pipeline(kreports_path) # Assert each rank-level CSV exists and is non-empty ranks = ["phylum", "class", "order", "family", "genus", "species"] @@ -59,17 +59,17 @@ def test_pipeline_overwrite_protection(demo_run): run_dir = demo_run["run_dir"] kreports_path = run_dir / "kreports" - run_pipeline(str(kreports_path)) + run_pipeline(kreports_path) # Second run without --overwrite must raise (library function, not sys.exit) with pytest.raises(FileExistsError): - run_pipeline(str(kreports_path)) + run_pipeline(kreports_path) def test_pipeline_overwrite_flag(demo_run): run_dir = demo_run["run_dir"] kreports_path = run_dir / "kreports" - run_pipeline(str(kreports_path)) + run_pipeline(kreports_path) # Second run with overwrite=True must succeed - run_pipeline(str(kreports_path), overwrite=True) + run_pipeline(kreports_path, overwrite=True) diff --git a/tests/test_integration.py 
b/tests/test_integration.py index 3fe61e6..4dc9511 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -2,6 +2,7 @@ import hashlib import itertools +from pathlib import Path import pandas as pd import pytest @@ -39,7 +40,7 @@ def combined_mpa_file(tmp_path): # --------------------------------------------------------------------------- -def _sha256(path) -> str: +def _sha256(path: Path) -> str: return hashlib.sha256(path.read_bytes()).hexdigest() @@ -53,7 +54,7 @@ def test_kreport_to_mpa_reproducible(kreport_file, tmp_path): def run(): out = tmp_path / f"out_{next(counter)}.MPA.TXT" - kreport_to_mpa(str(kreport_file), str(out), display_header=True) + kreport_to_mpa(kreport_file, out, display_header=True) return _sha256(out) assert run() == run() @@ -61,7 +62,7 @@ def run(): def test_kreport_to_mpa_standard_ranks_only(kreport_file, tmp_path): out = tmp_path / "out.MPA.TXT" - kreport_to_mpa(str(kreport_file), str(out)) + kreport_to_mpa(kreport_file, out) lines = out.read_text().splitlines() paths = [ln.split("\t")[0] for ln in lines] @@ -72,7 +73,7 @@ def test_kreport_to_mpa_standard_ranks_only(kreport_file, tmp_path): def test_kreport_to_mpa_excludes_unclassified_and_root(kreport_file, tmp_path): out = tmp_path / "out.MPA.TXT" - kreport_to_mpa(str(kreport_file), str(out)) + kreport_to_mpa(kreport_file, out) content = out.read_text() assert "unclassified" not in content assert "root" not in content @@ -80,7 +81,7 @@ def test_kreport_to_mpa_excludes_unclassified_and_root(kreport_file, tmp_path): def test_kreport_to_mpa_display_header(kreport_file, tmp_path): out = tmp_path / "out.MPA.TXT" - kreport_to_mpa(str(kreport_file), str(out), display_header=True) + kreport_to_mpa(kreport_file, out, display_header=True) first_line = out.read_text().splitlines()[0] assert first_line.startswith("#Classification") assert kreport_file.name in first_line @@ -88,12 +89,11 @@ def test_kreport_to_mpa_display_header(kreport_file, tmp_path): def 
test_kreport_to_mpa_paths_are_hierarchical(kreport_file, tmp_path): out = tmp_path / "out.MPA.TXT" - kreport_to_mpa(str(kreport_file), str(out)) + kreport_to_mpa(kreport_file, out) lines = out.read_text().splitlines() for ln in lines: path = ln.split("\t")[0] segments = path.split("|") - # Each segment must have a known two-letter prefix for seg in segments: assert "__" in seg, f"Unexpected MPA segment format: {seg!r}" @@ -108,7 +108,7 @@ def test_convert_to_csv_reproducible(counts_txt_file, tmp_path): def run(): out = tmp_path / f"out_{next(counter)}.csv" - convert_to_csv(str(counts_txt_file), str(out)) + convert_to_csv(counts_txt_file, out) return _sha256(out) assert run() == run() @@ -116,7 +116,7 @@ def run(): def test_convert_to_csv_transposes_correctly(counts_txt_file, tmp_path): out = tmp_path / "counts.csv" - convert_to_csv(str(counts_txt_file), str(out)) + convert_to_csv(counts_txt_file, out) df = pd.read_csv(out) assert "Sample_id" in df.columns @@ -139,7 +139,7 @@ def test_process_files_adds_header_and_cleans_names(tmp_path): dest.write_text( "s__Pseudomonas_aeruginosa\t300\t100\ns__Escherichia_coli\t200\t50\n" ) - process_files(str(source), str(dest)) + process_files(source, dest) result = dest.read_text() lines = result.splitlines() assert lines[0] == "#Classification\tsample1\tsample2" @@ -154,7 +154,7 @@ def test_process_files_reproducible(tmp_path): for i in range(2): dest = tmp_path / f"counts_{i}.txt" dest.write_text("s__Some_species\t10\n") - process_files(str(source), str(dest)) + process_files(source, dest) assert (tmp_path / "counts_0.txt").read_text() == ( tmp_path / "counts_1.txt" ).read_text() @@ -164,19 +164,19 @@ def test_process_files_missing_source_raises(tmp_path): dest = tmp_path / "counts.txt" dest.write_text("s__X\t10\n") with pytest.raises(FileNotFoundError): - process_files(str(tmp_path / "ghost.txt"), str(dest)) + process_files(tmp_path / "ghost.txt", dest) def test_process_files_missing_dest_raises(tmp_path): source = tmp_path / 
"COMBINED.txt" source.write_text("#Classification\tsample1.kreport\n") with pytest.raises(FileNotFoundError): - process_files(str(source), str(tmp_path / "ghost.txt")) + process_files(source, tmp_path / "ghost.txt") def test_convert_to_csv_missing_input_raises(tmp_path): with pytest.raises(FileNotFoundError): - convert_to_csv(str(tmp_path / "ghost.txt"), str(tmp_path / "out.csv")) + convert_to_csv(tmp_path / "ghost.txt", tmp_path / "out.csv") # --------------------------------------------------------------------------- @@ -189,7 +189,7 @@ def test_relabund_reproducible(counts_csv_file, tmp_path): def run(): out = tmp_path / f"ra_{next(counter)}.csv" - calculate_rel_abund(str(counts_csv_file), str(out)) + calculate_rel_abund(counts_csv_file, out) return _sha256(out) assert run() == run() @@ -197,7 +197,7 @@ def run(): def test_relabund_sums_to_100_per_sample(counts_csv_file, tmp_path): out = tmp_path / "ra.csv" - calculate_rel_abund(str(counts_csv_file), str(out)) + calculate_rel_abund(counts_csv_file, out) df = pd.read_csv(out) for sample, grp in df.groupby("Sample_id"): total = grp["rel_abund_perc"].sum() @@ -206,21 +206,21 @@ def test_relabund_sums_to_100_per_sample(counts_csv_file, tmp_path): def test_relabund_other_threshold_creates_other_group(counts_csv_file, tmp_path): out = tmp_path / "ra.csv" - calculate_rel_abund(str(counts_csv_file), str(out), other_threshold=99.0) + calculate_rel_abund(counts_csv_file, out, other_threshold=99.0) df = pd.read_csv(out) assert df["taxon"].str.startswith("Other").any() def test_relabund_no_zero_rows(counts_csv_file, tmp_path): out = tmp_path / "ra.csv" - calculate_rel_abund(str(counts_csv_file), str(out)) + calculate_rel_abund(counts_csv_file, out) df = pd.read_csv(out) assert (df["rel_abund_perc"] > 0).all() def test_relabund_missing_input_raises(tmp_path): with pytest.raises(FileNotFoundError): - calculate_rel_abund(str(tmp_path / "ghost.csv"), str(tmp_path / "out.csv")) + calculate_rel_abund(tmp_path / "ghost.csv", 
tmp_path / "out.csv") # --------------------------------------------------------------------------- @@ -308,7 +308,7 @@ def test_beta_div_too_few_samples_raises(tmp_path): def test_split_mpa_creates_all_rank_files(combined_mpa_file, tmp_path): - split_mpa(str(combined_mpa_file), str(tmp_path)) + split_mpa(combined_mpa_file, tmp_path) for rank in ("species", "genus", "family", "order", "class", "phylum"): assert (tmp_path / "txt" / f"counts_{rank}.txt").exists() @@ -318,14 +318,14 @@ def test_split_mpa_reproducible(combined_mpa_file, tmp_path): def run(): out = tmp_path / f"out_{next(counter)}" - split_mpa(str(combined_mpa_file), str(out)) + split_mpa(combined_mpa_file, out) return _sha256(out / "txt" / "counts_species.txt") assert run() == run() def test_split_mpa_filters_human_by_default(combined_mpa_file, tmp_path): - split_mpa(str(combined_mpa_file), str(tmp_path)) + split_mpa(combined_mpa_file, tmp_path) species = (tmp_path / "txt" / "counts_species.txt").read_text() assert "Homo_sapiens" not in species genus = (tmp_path / "txt" / "counts_genus.txt").read_text() @@ -333,36 +333,34 @@ def test_split_mpa_filters_human_by_default(combined_mpa_file, tmp_path): def test_split_mpa_keep_human_retains_homo(combined_mpa_file, tmp_path): - split_mpa(str(combined_mpa_file), str(tmp_path), keep_human=True) + split_mpa(combined_mpa_file, tmp_path, keep_human=True) species = (tmp_path / "txt" / "counts_species.txt").read_text() assert "Homo_sapiens" in species def test_split_mpa_viruses_only(combined_mpa_file, tmp_path): - split_mpa(str(combined_mpa_file), str(tmp_path), viruses_only=True) + split_mpa(combined_mpa_file, tmp_path, viruses_only=True) species = (tmp_path / "txt" / "counts_species.txt").read_text() assert "Virus_alpha" in species assert "Pseudomonas_aeruginosa" not in species def test_split_mpa_strips_path_prefix(combined_mpa_file, tmp_path): - split_mpa(str(combined_mpa_file), str(tmp_path)) + split_mpa(combined_mpa_file, tmp_path) species = (tmp_path / "txt" / 
"counts_species.txt").read_text() - # No MPA path separators should remain assert "|" not in species - # The rank prefix should remain (processing_script strips it later) assert "s__" in species def test_split_mpa_genus_excludes_species_lines(combined_mpa_file, tmp_path): - split_mpa(str(combined_mpa_file), str(tmp_path)) + split_mpa(combined_mpa_file, tmp_path) genus = (tmp_path / "txt" / "counts_genus.txt").read_text() assert "s__" not in genus def test_split_mpa_missing_input_raises(tmp_path): with pytest.raises(FileNotFoundError): - split_mpa(str(tmp_path / "ghost.txt"), str(tmp_path / "out")) + split_mpa(tmp_path / "ghost.txt", tmp_path / "out") # --------------------------------------------------------------------------- @@ -372,24 +370,24 @@ def test_split_mpa_missing_input_raises(tmp_path): def test_kreport_to_mpa_creates_output_dir(kreport_file, tmp_path): out = tmp_path / "new_subdir" / "out.MPA.TXT" - kreport_to_mpa(str(kreport_file), str(out)) + kreport_to_mpa(kreport_file, out) assert out.exists() def test_kreport_to_mpa_missing_input_raises(tmp_path): with pytest.raises(FileNotFoundError): - kreport_to_mpa(str(tmp_path / "ghost.kreport"), str(tmp_path / "out.MPA.TXT")) + kreport_to_mpa(tmp_path / "ghost.kreport", tmp_path / "out.MPA.TXT") def test_convert_to_csv_creates_output_dir(counts_txt_file, tmp_path): out = tmp_path / "new_subdir" / "counts.csv" - convert_to_csv(str(counts_txt_file), str(out)) + convert_to_csv(counts_txt_file, out) assert out.exists() def test_relabund_creates_output_dir(counts_csv_file, tmp_path): out = tmp_path / "new_subdir" / "ra.csv" - calculate_rel_abund(str(counts_csv_file), str(out)) + calculate_rel_abund(counts_csv_file, out) assert out.exists() @@ -421,7 +419,7 @@ def test_combine_mpa_creates_output_dir(tmp_path): a.write_text(SAMPLE_MPA_A) b.write_text(SAMPLE_MPA_B) out = tmp_path / "new_subdir" / "COMBINED.txt" - combine_mpa([str(a), str(b)], str(out)) + combine_mpa([a, b], out) assert out.exists() @@ -429,9 +427,7 
@@ def test_combine_mpa_missing_input_raises(tmp_path): existing = tmp_path / "a.MPA.TXT" existing.write_text(SAMPLE_MPA_A) with pytest.raises(FileNotFoundError): - combine_mpa( - [str(existing), str(tmp_path / "ghost.MPA.TXT")], str(tmp_path / "out.txt") - ) + combine_mpa([existing, tmp_path / "ghost.MPA.TXT"], tmp_path / "out.txt") # --------------------------------------------------------------------------- @@ -443,7 +439,7 @@ def test_process_files_missing_dest_still_raises(tmp_path): source = tmp_path / "COMBINED.txt" source.write_text("#Classification\tsample1.kreport\n") with pytest.raises(FileNotFoundError): - process_files(str(source), str(tmp_path / "nonexistent.txt")) + process_files(source, tmp_path / "nonexistent.txt") # --------------------------------------------------------------------------- @@ -458,7 +454,7 @@ def test_split_mpa_filters_terminal_rank_nodes(tmp_path): "d__Bacteria|p__Pseudomonadota|s__Pseudomonas_aeruginosa\t300\n" "d__Bacteria|p__Pseudomonadota|s__Pseudomonas_aeruginosa|t__strain_X\t10\n" ) - split_mpa(str(combined), str(tmp_path / "out")) + split_mpa(combined, tmp_path / "out") species = (tmp_path / "out" / "txt" / "counts_species.txt").read_text() assert "t__" not in species From 36adecda9119dcf3dc0dcfc05073cbcc490669bf Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 1 Jun 2026 20:50:58 +0200 Subject: [PATCH 13/13] chore: remove obsolete comments from scripts --- krakenparser/counts/convert2csv.py | 2 +- krakenparser/mpa/mpa_table.py | 4 ++-- krakenparser/stats/relabund.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/krakenparser/counts/convert2csv.py b/krakenparser/counts/convert2csv.py index 4b484b1..d20086c 100755 --- a/krakenparser/counts/convert2csv.py +++ b/krakenparser/counts/convert2csv.py @@ -31,7 +31,7 @@ def convert_to_csv(input_file: str, output_file: str) -> None: @app.callback(invoke_without_command=True) def main( - ctx: typer.Context, # Контекст для нативного хелпа + ctx: 
typer.Context, input_file: Optional[str] = typer.Option( None, "-i", diff --git a/krakenparser/mpa/mpa_table.py b/krakenparser/mpa/mpa_table.py index f4a13dc..2c30e79 100644 --- a/krakenparser/mpa/mpa_table.py +++ b/krakenparser/mpa/mpa_table.py @@ -57,7 +57,7 @@ def combine_mpa(in_files: list[Path], o_file: Path) -> None: split_vals = classification.split("|") curr_parent = "" for i in range(len(split_vals)): - test_val = "|".join(split_vals[:i]) # при i=0 → "" + test_val = "|".join(split_vals[:i]) if test_val in values: curr_parent = test_val @@ -102,7 +102,7 @@ def combine_mpa(in_files: list[Path], o_file: Path) -> None: @app.callback(invoke_without_command=True) def main( - ctx: typer.Context, # Контекст для работы с хелпом + ctx: typer.Context, in_files: Optional[list[Path]] = typer.Option( None, "-i", diff --git a/krakenparser/stats/relabund.py b/krakenparser/stats/relabund.py index ee924d9..9439f44 100644 --- a/krakenparser/stats/relabund.py +++ b/krakenparser/stats/relabund.py @@ -75,7 +75,7 @@ def calculate_rel_abund( @app.callback(invoke_without_command=True) def main( - ctx: typer.Context, # Контекст для вызова хелпа + ctx: typer.Context, input_file: Optional[Path] = typer.Option( None, "-i",