From 58c25c04e48e32dcefb294eea26699fddad331e5 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:27:52 +0200 Subject: [PATCH 01/25] docs(cli): add type annotations, docstrings, and comments to krakenparser entrypoint --- krakenparser/krakenparser.py | 49 +++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/krakenparser/krakenparser.py b/krakenparser/krakenparser.py index db3d951..c755fcb 100755 --- a/krakenparser/krakenparser.py +++ b/krakenparser/krakenparser.py @@ -1,8 +1,9 @@ -#!/usr/bin/env python -"""KrakenParser: Convert Kraken2 Reports to CSV and analyze microbial diversity. +#!/usr/bin/env python3 +"""Main command-line interface entry point for KrakenParser. -Built with native Typer subcommands while preserving a direct root interface -for the full pipeline execution without the 'run' keyword. +This module orchestrates the entire KrakenParser suite, exposing an end-to-end +automated pipeline alongside granular subcommands for step-by-step control +over metagenomic report parsing, normalization, and statistical analysis. 
""" import logging @@ -23,18 +24,21 @@ from krakenparser.stats.diversity import app as diversity_app from krakenparser.stats.relabund import app as relabund_app +# Fetch package version dynamically from metadata or fall back to unknown try: - __version__ = _pkg_version("krakenparser") + __version__: str = _pkg_version("krakenparser") except _PNF: __version__ = "unknown" -app = typer.Typer( +# Initialize primary Typer interface with global configuration +app: typer.Typer = typer.Typer( add_completion=False, context_settings={"help_option_names": ["-h", "--help"]}, ) -PANEL_NAME = "Advanced (Step-by-step pipeline control)" +PANEL_NAME: str = "Advanced (Step-by-step pipeline control)" +# Register individual step subcommands under a isolated help panel app.add_typer(mpa_app, name="mpa", rich_help_panel=PANEL_NAME) app.add_typer(combine_app, name="combine", rich_help_panel=PANEL_NAME) app.add_typer(split_app, name="split", rich_help_panel=PANEL_NAME) @@ -45,6 +49,14 @@ def _version_callback(value: bool) -> None: + """Eager callback executing the version flag logic. + + Args: + value: Boolean trigger provided by the Typer parameter evaluation. + + Raises: + typer.Exit: Gracefully terminates the runtime execution upon displaying version. + """ if value: print(f"KrakenParser {__version__}") raise typer.Exit() @@ -118,18 +130,20 @@ def main_callback( Each step behaves as an independent tool. Type 'krakenparser --help' to see options for a specific step. """ - + # Prevent execution loop if the engine passes control down to registered subcommands if ctx.invoked_subcommand is not None: return + # Execute monolithic end-to-end automation workflow if input targets are declared if input_dir: print("KrakenParser by Ilia V. 
Popov") - out_path = output_dir if output_dir else input_dir.parent + out_path: Path = output_dir if output_dir else input_dir.parent out_path.mkdir(parents=True, exist_ok=True) - log_file_path = out_path / "krakenparser.log" + log_file_path: Path = out_path / "krakenparser.log" - log_handler = logging.FileHandler(log_file_path, mode="w") + # Dynamically attach logging engine dedicated to current run output context + log_handler: logging.FileHandler = logging.FileHandler(log_file_path, mode="w") log_handler.setFormatter(logging.Formatter("%(message)s")) logging.basicConfig(level=logging.INFO, handlers=[log_handler]) @@ -153,12 +167,13 @@ def main_callback( print("All steps completed successfully!") print(f"Logs saved to {log_file_path}") - out_str = out_path.as_posix() + out_str: str = out_path.as_posix() - has_custom_depth = ( + # Interrogate parameter sources to tailor downstream recommendations accurately + has_custom_depth: bool = ( str(ctx.get_parameter_source("depth")) != "ParameterSource.DEFAULT" ) - has_custom_seed = ( + has_custom_seed: bool = ( str(ctx.get_parameter_source("seed")) != "ParameterSource.DEFAULT" ) @@ -196,11 +211,17 @@ def main_callback( raise typer.Exit() + # Fallback default interaction pattern rendering global help usage diagnostics print("KrakenParser by Ilia V. Popov") print(ctx.get_help()) def entry_point() -> None: + """Consolidated main system application executor wrapper. + + Handles external runtime events like manual cancellation securely to protect + terminal trace sanity. 
+ """ try: app() except KeyboardInterrupt: From fa0a081bcbbe3221a6cf20fd93cad7f79a62a72b Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:29:12 +0200 Subject: [PATCH 02/25] refactor(pipeline): harden pipeline orchestration with static type hints and docstrings --- krakenparser/pipeline.py | 153 ++++++++++++++++++++++++++++++--------- 1 file changed, 118 insertions(+), 35 deletions(-) diff --git a/krakenparser/pipeline.py b/krakenparser/pipeline.py index 5a3a5aa..9c28c01 100644 --- a/krakenparser/pipeline.py +++ b/krakenparser/pipeline.py @@ -1,5 +1,10 @@ -#!/usr/bin/env python -"""Full KrakenParser pipeline: kreport → MPA → combined → counts → rel_abund → diversity.""" +#!/usr/bin/env python3 +"""Core orchestration engine for the KrakenParser execution pipeline. + +This module consolidates independent taxonomic processing steps into a seamless, +end-to-end automated pipeline. It handles file validation, directory structure +sanitization, data transformations, statistical scaling, and diversity indexing. +""" import logging import shutil @@ -18,30 +23,52 @@ from krakenparser.stats.diversity import calc_alpha_div, calc_beta_div from krakenparser.stats.relabund import calculate_rel_abund -_log = logging.getLogger(__name__) +# Initialize module-level isolated logger +_log: logging.Logger = logging.getLogger(__name__) -app = typer.Typer( +# Primary pipeline subcommand routing sub-app +app: typer.Typer = typer.Typer( name="run", add_completion=False, context_settings={"help_option_names": ["-h", "--help"]}, ) +# Structural database directory map enforced by the pipeline architecture +_OUTPUT_SUBDIRS: tuple[str, ...] = ("intermediate", "counts", "rel_abund", "diversity") + + +def _is_processable(filepath: Path) -> bool: + """Validate file integrity prior to feeding it to text-parsing utilities. 
-def _is_processable(path: Path) -> bool: - """Return False for hidden files, files with null bytes, or non-UTF-8 files.""" - if path.name.startswith("."): + Ensures the target file is not a hidden system artifact, does not contain + binary null-byte contamination, and safely decodes via UTF-8 standard. + + Args: + filepath: A Path object referencing the target text file. + + Returns: + bool: True if the file matches structural sanity baselines, False otherwise. + """ + if filepath.name.startswith("."): return False + try: - chunk = path.read_bytes()[:1024] - if b"\x00" in chunk: + if b"\x00" in filepath.read_bytes(): return False - chunk.decode("utf-8") - return True - except (UnicodeDecodeError, OSError): + except Exception: return False - -_OUTPUT_SUBDIRS = ("intermediate", "counts", "rel_abund", "diversity") + # 3. STRICT UTF-8 check (this is where the bug is) + try: + # Add errors="strict" so that BOMs and stray encodings are not silently swallowed + with open(filepath, "r", encoding="utf-8", errors="strict") as f: + # Read a small chunk of the file to check it + f.read(1024) + return True + except UnicodeDecodeError: + return False + except Exception: + return False def run_pipeline( @@ -56,16 +83,43 @@ def run_pipeline( seed: Optional[int] = None, overwrite: bool = False, ) -> None: - source_dir = input_dir + """Execute the sequential programmatic workflow blocks of KrakenParser. + + Validates resources, purges legacy directories under explicit overwrite rules, + converts raw reports to structured matrices, strips structural taxonomic strings, + scales measurements, and exports ecological diversity indices. + + Args: + input_dir: Path pointing to the source directory containing raw reports. + output_dir: Custom export destination path. Defaults to input_dir parent if None. + keep_human: If True, human-derived entries are kept rather than filtered out. + viruses_only: Restricts the processing scope exclusively to the Viruses domain. 
+ bacteria_only: Restricts the processing scope exclusively to the Bacteria domain. + fungi_only: Restricts the processing scope exclusively to the Fungi kingdom. + archaea_only: Restricts the processing scope exclusively to the Archaea domain. + rarefaction_depth: Rarefaction depth passed to the beta-diversity calculation. + seed: Random seed used to make rarefaction deterministic. + overwrite: If True, existing pipeline output subdirectories are removed before the run. + + Raises: + FileNotFoundError: Triggered if the declared input resource directory is absent. + FileExistsError: Raised if the output directory already contains pipeline subdirectories + and overwrite is False. + typer.Exit: Raised with exit code 1 if no MPA files are produced or species counts are missing. + """ + source_dir: Path = input_dir if not source_dir.is_dir(): raise FileNotFoundError(f"Input directory not found: {source_dir}") - out_dir = output_dir if output_dir else source_dir.parent + out_dir: Path = output_dir if output_dir else source_dir.parent out_dir.mkdir(parents=True, exist_ok=True) - existing = [out_dir / d for d in _OUTPUT_SUBDIRS if (out_dir / d).exists()] + # Refuse to clobber pre-existing output subdirectories unless overwrite is set + existing: list[Path] = [ + out_dir / d for d in _OUTPUT_SUBDIRS if (out_dir / d).exists() + ] if existing and not overwrite: - names = ", ".join(d.name for d in existing) + names: str = ", ".join(d.name for d in existing) raise FileExistsError( f"Output already exists in '{out_dir}' ({names}).\n" "Use --overwrite to overwrite it." 
@@ -75,61 +129,73 @@ def run_pipeline( shutil.rmtree(d) _log.info("Removed existing directory: %s", d) - intermediate_dir = out_dir / "intermediate" + # Step 1: Initialize structural staging environment layers + intermediate_dir: Path = out_dir / "intermediate" intermediate_dir.mkdir(exist_ok=True) - mpa_dir = intermediate_dir / "mpa" + mpa_dir: Path = intermediate_dir / "mpa" mpa_dir.mkdir(exist_ok=True) + + # Step 2: Compile independent text reports to unified MetaPhlAn format for f in sorted(source_dir.iterdir()): if not f.is_file(): continue if not _is_processable(f): _log.info("Skipping: %s", f.name) continue - out_name = f.stem + ".MPA.TXT" + out_name: str = f.stem + ".MPA.TXT" kreport_to_mpa(f, mpa_dir / out_name, display_header=True) - mpa_files = sorted(mpa_dir.glob("*.MPA.TXT")) + mpa_files: list[Path] = sorted(mpa_dir.glob("*.MPA.TXT")) if not mpa_files: print("Error: no MPA files found after conversion.", file=sys.stderr) raise typer.Exit(code=1) - combined_file = intermediate_dir / "COMBINED.txt" + + # Step 3: Matrix aggregation across multiple samples + combined_file: Path = intermediate_dir / "COMBINED.txt" combine_mpa(mpa_files, combined_file) _log.info("MPA files combined. 
Output: %s", combined_file) + # Step 4: Isolate targeted biological taxonomic strata and domains split_mpa( - str(combined_file), - str(intermediate_dir), + combined_file, + intermediate_dir, keep_human=keep_human, viruses_only=viruses_only, bacteria_only=bacteria_only, fungi_only=fungi_only, archaea_only=archaea_only, ) - txt_dir = intermediate_dir / "txt" + txt_dir: Path = intermediate_dir / "txt" + # Step 5: Clean prefix tags and syntactic formatting anomalies for txt_file in sorted(txt_dir.glob("counts_*.txt")): - process_files(str(combined_file), str(txt_file)) + process_files(combined_file, txt_file) - counts_dir = out_dir / "counts" + # Step 6: Construct tidy row/column layout structures within CSV tables + counts_dir: Path = out_dir / "counts" counts_dir.mkdir(exist_ok=True) for txt_file in sorted(txt_dir.glob("counts_*.txt")): - csv_file = counts_dir / txt_file.with_suffix(".csv").name - convert_to_csv(str(txt_file), str(csv_file)) + csv_file: Path = counts_dir / txt_file.with_suffix(".csv").name + convert_to_csv(txt_file, csv_file) - rel_abund_dir = out_dir / "rel_abund" + # Step 7: Apply normalization metrics to convert counts to relative distribution percentages + rel_abund_dir: Path = out_dir / "rel_abund" rel_abund_dir.mkdir(exist_ok=True) for csv_file in sorted(counts_dir.glob("counts_*.csv")): - ra_file = rel_abund_dir / csv_file.name.replace("counts_", "ra_") + ra_file: Path = rel_abund_dir / csv_file.name.replace("counts_", "ra_") calculate_rel_abund(csv_file, ra_file) - species_csv = counts_dir / "counts_species.csv" + # Step 8: Parse ecological matrices to capture microbial diversity indices + species_csv: Path = counts_dir / "counts_species.csv" if not species_csv.exists(): print(f"Error: species counts not found: {species_csv}", file=sys.stderr) raise typer.Exit(code=1) - diversity_dir = out_dir / "diversity" + + diversity_dir: Path = out_dir / "diversity" diversity_dir.mkdir(exist_ok=True) - df = pd.read_csv(species_csv, index_col=0) + + df: 
pd.DataFrame = pd.read_csv(species_csv, index_col=0) calc_alpha_div(df, diversity_dir) calc_beta_div(df, diversity_dir, rarefaction_depth=rarefaction_depth, seed=seed) @@ -196,6 +262,23 @@ def main( help="Overwrite the output directory if it already exists.", ), ) -> None: + """CLI exposure wrapper for the unified run_pipeline automated execution loop. + + Args: + input_dir: Dynamic input target. Passed directly via options. + output_dir: Custom export destination route. + keep_human: Host containment configuration option. + viruses: Viral isolation target selector. + bacteria: Bacterial isolation target selector. + fungi: Mycological isolation target selector. + archaea: Archaeal isolation target selector. + depth: Uniform sequence metric baseline threshold. + seed: Execution randomization initialization state. + overwrite: Overrides data locks protecting directories. + + Raises: + typer.Exit: Aborts execution with a system error code 1 when intercepts system faults. + """ logging.basicConfig(level=logging.INFO, format="%(message)s") try: From e3956255d257bb6d5bda40ccb7790a33bbb7f84b Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:30:52 +0200 Subject: [PATCH 03/25] refactor(utils): add explicit type annotations and docstrings to utils.py --- krakenparser/utils.py | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/krakenparser/utils.py b/krakenparser/utils.py index 7f9a780..1517864 100644 --- a/krakenparser/utils.py +++ b/krakenparser/utils.py @@ -1,10 +1,33 @@ -# krakenparser/utils.py +#!/usr/bin/env python3 +"""Shared utility wrappers and filesystem helpers for the KrakenParser suite. + +This module provides low-level infrastructure operations, such as safe path +resolution and idempotent directory creation, ensuring robust filesystem execution +across diverse operating systems. 
+""" + from pathlib import Path def ensure_output_dir(path: str | Path, is_file: bool = True) -> Path: - """Create parent directory for a file output, or the directory itself.""" - p = Path(path) - target = p.parent if is_file else p - target.mkdir(parents=True, exist_ok=True) - return p + """Ensure the target directory or parent directory tree exists layout-ready. + + If the output target is designated as a file, this utility creates its + containing parent directory. If designated as a directory, it constructs + the target directory itself. Operations are idempotent (`exist_ok=True`). + + Args: + path: A string path or Path object representing the targeted filesystem entry. + is_file: If True, treats the path as a file and ensures its parent directory + exists. If False, treats the entire path as a directory to create. + + Returns: + Path: A fully instantiated Path object pointing to the original target destination. + """ + path_obj: Path = Path(path) + + # Resolve whether to isolate the parent directory or target the path directly + target_dir: Path = path_obj.parent if is_file else path_obj + target_dir.mkdir(parents=True, exist_ok=True) + + return path_obj From de461ff925cf26970dc6cddaeeff6c6281f7f359 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:31:54 +0200 Subject: [PATCH 04/25] refactor(init): add explicit type annotation to __all__ export sequence --- krakenparser/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/krakenparser/__init__.py b/krakenparser/__init__.py index 2905ccb..11e7468 100755 --- a/krakenparser/__init__.py +++ b/krakenparser/__init__.py @@ -2,7 +2,7 @@ from .kpplot.stackedbar import stacked_barplot from .kpplot.streamgraph import streamgraph -__all__ = [ +__all__: list[str] = [ "stacked_barplot", "streamgraph", "clustermap", From 83f155a83ee489f3dcc3efbe4eaccc81fa3134f0 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:33:54 +0200 Subject: [PATCH 05/25] refactor(counts): 
modernize matrix transposition utility with Path typing and docstrings --- krakenparser/counts/convert2csv.py | 62 ++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/krakenparser/counts/convert2csv.py b/krakenparser/counts/convert2csv.py index d20086c..afd1828 100755 --- a/krakenparser/counts/convert2csv.py +++ b/krakenparser/counts/convert2csv.py @@ -1,6 +1,12 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 +"""Matrix manipulation utility for restructuring metagenomic abundance tables. + +This module converts tab-delimited abundance tables (traditionally structured +with features/taxa as rows and samples as columns) into standardized, +transposed CSV sheets conforming to the tidy data format (samples as rows). +""" + import logging -import sys from pathlib import Path from typing import Optional @@ -9,40 +15,59 @@ from krakenparser.utils import ensure_output_dir -_log = logging.getLogger(__name__) +# Initialize module-level isolated logger +_log: logging.Logger = logging.getLogger(__name__) -app = typer.Typer( +# Dedicated Typer routing application instantiation +app: typer.Typer = typer.Typer( name="csv", add_completion=False, context_settings={"help_option_names": ["-h", "--help"]}, ) -def convert_to_csv(input_file: str, output_file: str) -> None: - in_path = Path(input_file) - if not in_path.is_file(): - raise FileNotFoundError(f"Input file not found: {in_path}") - out_path = ensure_output_dir(output_file, is_file=True) +def convert_to_csv(input_file: Path, output_file: Path) -> None: + """Transpose a tab-separated matrix and export it as a sample-centric CSV. + + Reads a matrix where columns represent samples and rows represent taxa, + performs an algebraic transposition operation (.T), and locks the new row + index under the canonical 'Sample_id' header label. + + Args: + input_file: Path to the validated incoming tab-separated matrix file. 
+ output_file: Target path where the restructured CSV matrix will be dumped. - data = pd.read_csv(in_path, sep="\t", index_col=0) + Raises: + FileNotFoundError: Triggered if the specified input text resource is missing. + """ + if not input_file.is_file(): + raise FileNotFoundError(f"Input file not found: {input_file}") + + out_path: Path = ensure_output_dir(output_file, is_file=True) + + # Load high-dimensional matrix (Rows: Taxa, Columns: Samples) + data: pd.DataFrame = pd.read_csv(input_file, sep="\t", index_col=0) + + # Execute matrix transposition to shift samples to rows (Tidy Data layout) data.T.to_csv(out_path, index_label="Sample_id") - _log.info("Data converted and saved as '%s'.", output_file) + + _log.info("Data successfully transposed and saved to '%s'.", output_file) @app.callback(invoke_without_command=True) def main( ctx: typer.Context, - input_file: Optional[str] = typer.Option( + input_file: Optional[Path] = typer.Option( None, "-i", "--input", - help="Path to the input TXT file. This file should contain sample names in columns and microbial taxa in rows.", + help="Path to the input tab-delimited TXT file (samples in columns, taxa in rows).", ), - output_file: Optional[str] = typer.Option( + output_file: Optional[Path] = typer.Option( None, "-o", "--output", - help="Path to the output CSV file. 
The script will restructure the data and save it here.", + help="Path to the output transposed CSV file.", ), ) -> None: """Reads a TXT file, reorganizes the data, and converts it into a CSV file.""" @@ -53,16 +78,13 @@ def main( raise typer.Exit() if not input_file or not output_file: - print( - "Error: Missing required options '-i / --input' and '-o / --output'.", - file=sys.stderr, - ) + print("Error: Missing required options '-i / --input' and '-o / --output'.") raise typer.Exit(code=1) try: convert_to_csv(input_file, output_file) except FileNotFoundError as e: - print(f"Error: {e}", file=sys.stderr) + print(f"Error: {e}") raise typer.Exit(code=1) From 7edb694257bb61b24fdfe82f3af2b293fe4b63ce Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:35:38 +0200 Subject: [PATCH 06/25] refactor(counts): add type annotations and docstrings to processing_script.py --- krakenparser/counts/processing_script.py | 103 ++++++++++++++++------- 1 file changed, 72 insertions(+), 31 deletions(-) diff --git a/krakenparser/counts/processing_script.py b/krakenparser/counts/processing_script.py index 065d25e..b3e9608 100755 --- a/krakenparser/counts/processing_script.py +++ b/krakenparser/counts/processing_script.py @@ -1,4 +1,11 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 +"""Post-processing matrix utility for metadata refinement and taxonomic sanitization. + +This module cleans upstream pipeline artifacts by removing technical file extensions +from sample headers and restoring canonical spaces to underscore-separated taxonomic +nomenclature strings (e.g., converting 's__Escherichia_coli' to 'Escherichia coli'). +File mutations are executed via atomic filesystem transactions. 
+""" import logging import os @@ -9,9 +16,11 @@ import typer -_log = logging.getLogger(__name__) +# Initialize module-level isolated logger +_log: logging.Logger = logging.getLogger(__name__) -app = typer.Typer( +# Dedicated Typer routing application instantiation +app: typer.Typer = typer.Typer( name="process", add_completion=False, context_settings={"help_option_names": ["-h", "--help"]}, @@ -19,63 +28,95 @@ def modify_taxa_names(line: str) -> str: - prefixes = ["s__", "g__", "f__", "o__", "c__", "p__"] + """Sanitize taxonomic names by replacing internal underscores with spaces. + + Scans the line for standard taxonomic rank prefixes (s__, g__, etc.). If found, + the primary taxon descriptor string is decoupled, sanitized of internal + technical underscores, and reconstructed while preserving tailing tab metrics. + + Args: + line: A raw text row from the matrix containing taxonomic descriptors. + + Returns: + str: The structurally preserved string with restored space characters. + """ + prefixes: list[str] = ["s__", "g__", "f__", "o__", "c__", "p__"] for prefix in prefixes: if line.startswith(prefix): - parts = line[len(prefix) :].split("\t") + # Clean string parsing utilizing standard tab separation matrices + parts: list[str] = line.removeprefix(prefix).split("\t") parts[0] = parts[0].replace("_", " ") return "\t".join(parts) return line -def process_files(source_file: str, destination_file: str) -> None: - src_path = Path(source_file) - if not src_path.is_file(): - raise FileNotFoundError(f"Source file not found: {src_path}") - dest_path = Path(destination_file) - if not dest_path.is_file(): - raise FileNotFoundError(f"Destination file not found: {dest_path}") +def process_files(source_file: Path, destination_file: Path) -> None: + """Synchronize matrix headers and sanitize taxonomic profiles atomically. 
+ + Extracts clean cohort descriptors from the header of a source tracker, + applies string cleaning to a targeted taxonomy mapping spreadsheet, + and updates the destination file utilizing atomic replacement blocks. - # Read the first line from the source file and modify it - with open(src_path, "r") as file: - first_line_source = file.readline() - modified_first_line = "\t".join( + Args: + source_file: Validated Path to the template matrix containing pristine headers. + destination_file: Target Path to the file undergoing line-by-line taxonomy cleaning. + + Raises: + FileNotFoundError: Triggered if either the source or destination targets are absent. + """ + if not source_file.is_file(): + raise FileNotFoundError(f"Source file not found: {source_file}") + if not destination_file.is_file(): + raise FileNotFoundError(f"Destination file not found: {destination_file}") + + # Step 1: Read and truncate raw pipeline suffixes from sample headers + with open(source_file, "r", encoding="utf-8") as file: + first_line_source: str = file.readline() + + modified_first_line: str = "\t".join( word.split(".")[0] for word in first_line_source.split() ) - # Read all content from the destination file and modify taxa names - with open(dest_path, "r") as file: - lines = file.readlines() - modified_lines = [modify_taxa_names(line.strip()) for line in lines] + # Step 2: Read targets and map taxonomic updates lazily across lists + with open(destination_file, "r", encoding="utf-8") as file: + lines: list[str] = file.readlines() + + modified_lines: list[str] = [modify_taxa_names(line.strip()) for line in lines] - # Combine the modified first line with the modified content of the destination file - updated_content = modified_first_line + "\n" + "\n".join(modified_lines) + # Step 3: Integrate matrices and commit layout modifications to disk + joined_lines: str = "\n".join(modified_lines) + updated_content: str = f"{modified_first_line}\n{joined_lines}" - # Write atomically: write to a temp 
file in the same directory, then replace + # Secure atomic writer operations targeting adjacent scratch space regions with tempfile.NamedTemporaryFile( - mode="w", dir=dest_path.parent, delete=False, suffix=".tmp" + mode="w", + dir=destination_file.parent, + delete=False, + suffix=".tmp", + encoding="utf-8", ) as tmp: tmp.write(updated_content) - tmp_path = tmp.name - os.replace(tmp_path, dest_path) + tmp_path: str = tmp.name - _log.info(f"Processed {destination_file} successfully.") + # Commit transactions atomically across POSIX virtual environments + os.replace(tmp_path, destination_file) + _log.info("Processed '%s' successfully.", destination_file) @app.callback(invoke_without_command=True) def main( ctx: typer.Context, - input_file: Optional[str] = typer.Option( + input_file: Optional[Path] = typer.Option( None, "-i", "--input", - help="Path to the source file. This file's first line will be read and modified.", + help="Path to the source file (used to extract and truncate header labels).", ), - output_file: Optional[str] = typer.Option( + output_file: Optional[Path] = typer.Option( None, "-o", "--output", - help="Path to the destination file. 
This file's contents will be updated with cleaned taxa names.", + help="Path to the destination matrix undergoing taxonomic name sanitation.", ), ) -> None: """Reads a source file, processes its first line, modifies taxa names in a destination file, and updates it.""" From 38245252c0effdbaa1a71447a874cb739876f7f1 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:37:10 +0200 Subject: [PATCH 07/25] refactor(counts): add type annotations and docstrings to split_mpa.py --- krakenparser/counts/split_mpa.py | 120 ++++++++++++++++++++++--------- 1 file changed, 86 insertions(+), 34 deletions(-) diff --git a/krakenparser/counts/split_mpa.py b/krakenparser/counts/split_mpa.py index ac90ef1..ef42763 100644 --- a/krakenparser/counts/split_mpa.py +++ b/krakenparser/counts/split_mpa.py @@ -1,7 +1,10 @@ -#!/usr/bin/env python -"""Split a combined MPA table into per-rank TXT files. +#!/usr/bin/env python3 +"""Decomposition utility to partition master MPA matrices into rank-specific text tables. -Replaces decombine.sh and decombine_viruses.sh. +This module splits combined multi-sample MetaPhlAn files into separate tables grouped +by taxonomic rank (species, genus, family, etc.). It supports on-the-fly filtering +for specific biological domains (e.g., Viruses, Bacteria) and filters out host +contamination profiles using predefined taxonomic blacklists. 
""" import logging @@ -14,15 +17,18 @@ from krakenparser.utils import ensure_output_dir -_log = logging.getLogger(__name__) +# Initialize module-level isolated logger +_log: logging.Logger = logging.getLogger(__name__) -app = typer.Typer( +# Dedicated Typer routing application instantiation +app: typer.Typer = typer.Typer( name="split", add_completion=False, context_settings={"help_option_names": ["-h", "--help"]}, ) -_RANKS = [ +# Immutable configuration schema mapping ranks, targets, and descendents to drop +_RANKS: list[tuple[str, str, list[str]]] = [ ("species", "s__", []), ("genus", "g__", ["s__"]), ("family", "f__", ["s__", "g__"]), @@ -31,7 +37,8 @@ ("phylum", "p__", ["s__", "g__", "f__", "o__", "c__"]), ] -_HUMAN_MARKERS = frozenset( +# Host/human filtering target taxonomy markers +_HUMAN_MARKERS: frozenset[str] = frozenset( [ "s__Homo_sapiens", "g__Homo", @@ -42,48 +49,88 @@ ] ) -_ACCESSION_RE = re.compile(r"(SRS|SRR|SRX|ERS|ERR|ERX|DRS|DRR|DRX)\d*-") +# Regular expression matching SRA/ENA run technical accession sub-strings +_ACCESSION_RE: re.Pattern[str] = re.compile( + r"(SRS|SRR|SRX|ERS|ERR|ERX|DRS|DRR|DRX)\d*-" +) def _strip_path_prefix(line: str) -> str: - tab = line.find("\t") + """Isolate the terminal taxonomic clade and purge short-read archive prefixes. + + Extracts the right-most node classification component from pipe-separated + lineage string paths and trims technical sequencing metadata tags. + + Args: + line: Raw tab-separated line from an MPA summary matrix. + + Returns: + str: Cleansed and isolated clade metric description row. 
+ """ + tab: int = line.find("\t") if tab == -1: return line path, rest = line[:tab], line[tab:] - pipe = path.rfind("|") - segment = path[pipe + 1 :] if pipe != -1 else path + pipe: int = path.rfind("|") + segment: str = path[pipe + 1 :] if pipe != -1 else path return _ACCESSION_RE.sub("", segment + rest) def _human_in_line(line: str) -> bool: - tab = line.find("\t") - path = line[:tab] if tab != -1 else line - segments = set(path.split("|")) + """Verify if the taxonomic lineage contains human contamination markers. + + Args: + line: Raw text line containing structural pipe-separated classifications. + + Returns: + bool: True if the lineage intersects with monitored human host constraints. + """ + tab: int = line.find("\t") + path: str = line[:tab] if tab != -1 else line + segments: set[str] = set(path.split("|")) return bool(segments & _HUMAN_MARKERS) def split_mpa( - input_file: str, - output_dir: str, + input_file: Path, + output_dir: Path, viruses_only: bool = False, bacteria_only: bool = False, fungi_only: bool = False, archaea_only: bool = False, keep_human: bool = False, ) -> None: - in_path = Path(input_file) - if not in_path.is_file(): - raise FileNotFoundError(f"Input file not found: {in_path}") - out_path = ensure_output_dir(output_dir, is_file=False) + """Deconstruct an MPA layout spreadsheet into separate single-rank count matrices. + + Applies selective biological domain filters, drops non-target sub-clades, + performs host background depletion checks, and exports isolated text matrices + under an independent 'txt' directory layout structure. + + Args: + input_file: Validated Path to the incoming source master MPA file. + output_dir: Path locating the destination output root workspace directory. + viruses_only: If True, blocks all entries missing 'd__Viruses' tokens. + bacteria_only: If True, blocks all entries missing 'd__Bacteria' tokens. + fungi_only: If True, blocks all entries missing 'k__Fungi' tokens. 
+ archaea_only: If True, blocks all entries missing 'd__Archaea' tokens. + keep_human: If True, skips host background depletion steps. + + Raises: + FileNotFoundError: Triggered if the targeted raw matrix file cannot be loaded. + """ + if not input_file.is_file(): + raise FileNotFoundError(f"Input file not found: {input_file}") + + out_path: Path = ensure_output_dir(output_dir, is_file=False) (out_path / "txt").mkdir(exist_ok=True) - all_lines = [ - ln - for ln in in_path.read_text().splitlines() - if not ln.startswith("#") and ln.strip() - ] + # High-performance streaming line extractor skipping comments and layout whitespace + with open(input_file, "r", encoding="utf-8") as fh: + all_lines: list[str] = [ + ln for line in fh if (ln := line.strip()) and not ln.startswith("#") + ] - data_lines = all_lines.copy() + data_lines: list[str] = all_lines.copy() if viruses_only: data_lines = [ln for ln in data_lines if "d__Viruses" in ln] if bacteria_only: @@ -93,17 +140,19 @@ def split_mpa( if archaea_only: data_lines = [ln for ln in data_lines if "d__Archaea" in ln] + # Re-integrate target host sequences if preservation flags are set if keep_human: - human_lines = [ln for ln in all_lines if _human_in_line(ln)] + human_lines: list[str] = [ln for ln in all_lines if _human_in_line(ln)] data_lines = list(dict.fromkeys(data_lines + human_lines)) + # Iteratively evaluate taxons and construct independent files for rank_name, rank_prefix, exclude_prefixes in _RANKS: - result = [] + result: list[str] = [] for line in data_lines: if rank_prefix not in line: continue - if "t__" in line: + if "t__" in line: # Skip raw strain-level markers continue if any(ep in line for ep in exclude_prefixes): continue @@ -111,8 +160,11 @@ def split_mpa( continue result.append(_strip_path_prefix(line)) - out_file = out_path / "txt" / f"counts_{rank_name}.txt" - out_file.write_text("\n".join(result) + ("\n" if result else "")) + out_file: Path = out_path / "txt" / f"counts_{rank_name}.txt" + + # 
Terminate the file with a newline only when at least one row was collected + trailing_newline: str = "\n" if result else "" + out_file.write_text("\n".join(result) + trailing_newline, encoding="utf-8") _log.info("MPA file split successfully. Output stored in %s", output_dir) @@ -120,17 +172,17 @@ def split_mpa( @app.callback(invoke_without_command=True) def main( ctx: typer.Context, - input_file: Optional[str] = typer.Option( + input_file: Optional[Path] = typer.Option( None, "-i", "--input", help="Input combined MPA file.", ), - output_dir: Optional[str] = typer.Option( + output_dir: Optional[Path] = typer.Option( None, "-o", "--output", - help="Output directory.", + help="Output directory root pathway.", ), viruses_only: bool = typer.Option( False, From f190b3e826ae706de7d6ef160bc6050daed4a0c3 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:38:02 +0200 Subject: [PATCH 08/25] refactor(kpplot): expand sub-package initialization namespace and enforce explicit export typing --- krakenparser/kpplot/__init__.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/krakenparser/kpplot/__init__.py b/krakenparser/kpplot/__init__.py index c353a3e..1dd6b4e 100644 --- a/krakenparser/kpplot/__init__.py +++ b/krakenparser/kpplot/__init__.py @@ -1,3 +1,14 @@ from .base import KpPlotBase +from .clustermap import KpClustermap, clustermap +from .stackedbar import KpStackedBarplot, stacked_barplot +from .streamgraph import KpStreamgraph, streamgraph -__all__ = ["KpPlotBase"] +__all__: list[str] = [ + "KpPlotBase", + "KpClustermap", + "clustermap", + "KpStackedBarplot", + "stacked_barplot", + "KpStreamgraph", + "streamgraph", +] From 8989fb224b0d5656b99029d89d6ea35437f0ce3a Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:39:15 +0200 Subject: [PATCH 09/25] refactor(kpplot): implement static type hinting and docstrings in plotting base --- krakenparser/kpplot/base.py | 102 +++++++++++++++++++++++++++--------- 1 file
changed, 77 insertions(+), 25 deletions(-) diff --git a/krakenparser/kpplot/base.py b/krakenparser/kpplot/base.py index 8d96aba..cc72858 100644 --- a/krakenparser/kpplot/base.py +++ b/krakenparser/kpplot/base.py @@ -1,37 +1,61 @@ -from typing import Optional +#!/usr/bin/env python3 +"""Base classes and data aggregation utilities for metagenomic visualization. + +This module provides structural baselines for managing Matplotlib canvas states +and executing dataframe transformations, such as merging abundance matrices +with cohort metadata schemas and performing group-level re-normalization. +""" + +from pathlib import Path +from typing import Optional, Union import matplotlib.pyplot as plt import pandas as pd class KpPlotBase: - def __init__(self, fig: plt.Figure, ax: plt.Axes): - self.fig = fig - self.ax = ax + """Base orchestration class managing Matplotlib figure and axis contexts.""" + + def __init__(self, fig: plt.Figure, ax: plt.Axes) -> None: + """Initialize the plot base wrapper with canvas references. + + Args: + fig: The Matplotlib Figure instance acting as the global canvas. + ax: The Matplotlib Axes instance mapping sub-plot coordinate space. + """ + self.fig: plt.Figure = fig + self.ax: plt.Axes = ax def plotfig(self) -> plt.Figure: - # self.fig.tight_layout() + """Display the figure with ``plt.show()`` and return it. + + Returns: + plt.Figure: The wrapped Matplotlib Figure object. + """ plt.show() return self.fig def savefig( self, - path: str, + path: Union[Path, str], dpi: int = 300, transparent: bool = False, bbox_inches: Optional[str] = "tight", - ): - """ - Save the figure to a file. + ) -> None: + """Save the figure to an image file. - Parameters: - - path: Path to save the figure (e.g. "plot.png", "plot.svg"). - - dpi: Dots per inch (resolution) of the output image. Default is 300. - - transparent: Whether to make saved figure transparent. - - bbox_inches: Bounding box option passed to matplotlib. Default is "tight".
+ Args: + path: Destination file path for the saved figure (e.g. "plot.png", "plot.svg"). + dpi: Resolution of the output image in dots per inch. Defaults to 300. + transparent: Whether to save the figure with a transparent background. + bbox_inches: Bounding box option passed to Matplotlib. Defaults to "tight". """ + target_path: Path = Path(path) self.fig.savefig( - path, dpi=dpi, transparent=transparent, bbox_inches=bbox_inches + target_path, + dpi=dpi, + transparent=transparent, + bbox_inches=bbox_inches, ) @@ -40,18 +64,46 @@ def aggregate_by_metadata( metadata: pd.DataFrame, metadata_group: str, ) -> pd.DataFrame: - """Merge df with metadata and re-normalise rel_abund_perc per group.""" + """Consolidate abundance samples by cohorts and re-normalize relative profiles. + + Maps discrete sample rows to experimental variables, calculates group-specific + abundance means per taxon, and scales profiles to ensure the sum equals 100%. + + Args: + df: Input DataFrame containing 'Sample_id', 'taxon', and 'rel_abund_perc'. + metadata: Metadata worksheet mapping 'Sample_id' to cohort traits. + metadata_group: Specific targeted feature column label inside metadata. + + Returns: + pd.DataFrame: Long-format frame, one row per (group, taxon); 'Sample_id' holds the group label. + + Raises: + ValueError: If metadata lacks 'Sample_id' or the column named by metadata_group. + """ if "Sample_id" not in metadata.columns: - raise ValueError("metadata must contain 'Sample_id' column") + raise ValueError( + "Metadata schema violates structural constraints: missing 'Sample_id'." + ) if metadata_group not in metadata.columns: - raise ValueError(f"'{metadata_group}' column not found in metadata") - df = df.merge(metadata[["Sample_id", metadata_group]], on="Sample_id", how="left") - df = ( - df.groupby([metadata_group, "taxon"], as_index=False)["rel_abund_perc"] + raise ValueError( + f"Target cohort variable column absent from metadata: '{metadata_group}'."
+ ) + + # Step 1: Execute left-join operation to append experimental cohort tags + merged_df: pd.DataFrame = df.merge( + metadata[["Sample_id", metadata_group]], on="Sample_id", how="left" + ) + + # Step 2: Compute arithmetic means for distinct taxonomic categories within groups + grouped_df: pd.DataFrame = ( + merged_df.groupby([metadata_group, "taxon"], as_index=False)["rel_abund_perc"] .mean() .rename(columns={metadata_group: "Sample_id"}) ) - df["rel_abund_perc"] = df.groupby("Sample_id")["rel_abund_perc"].transform( - lambda x: (x / x.sum()) * 100 - ) - return df + + # Step 3: High-performance vectorized transformation to re-normalize profiles to 100% + # Replaces slow lambda functions with native Series grouping division + group_sums = grouped_df.groupby("Sample_id")["rel_abund_perc"].transform("sum") + grouped_df["rel_abund_perc"] = (grouped_df["rel_abund_perc"] / group_sums) * 100 + + return grouped_df From 7073e864fee374f7efd9781848ad7e16dc5ba5ca Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:40:38 +0200 Subject: [PATCH 10/25] refactor(kpplot): add strict literal constraints, type annotations, and docstrings to clustermap.py --- krakenparser/kpplot/clustermap.py | 228 ++++++++++++++++++------------ 1 file changed, 141 insertions(+), 87 deletions(-) diff --git a/krakenparser/kpplot/clustermap.py b/krakenparser/kpplot/clustermap.py index 704bb73..a96d4eb 100644 --- a/krakenparser/kpplot/clustermap.py +++ b/krakenparser/kpplot/clustermap.py @@ -1,114 +1,158 @@ -from typing import List, Optional, Tuple +#!/usr/bin/env python3 +"""Hierarchical clustering and heatmap visualization module for metagenomic profiles. + +This module leverages Seaborn's ClusterGrid matrix engine to compute and render +dendrogram-driven abundance clusterings, enabling rapid detection of co-occurrence +taxonomic patterns and sample cohort similarities. 
+""" + +from typing import Literal, Optional, Sequence, Tuple import matplotlib.pyplot as plt import pandas as pd import seaborn as sns +from seaborn.matrix import ClusterGrid from .base import KpPlotBase, aggregate_by_metadata class KpClustermap(KpPlotBase): - pass + """Orchestration context wrapper encapsulating Seaborn ClusterGrid matrix states.""" + + def __init__(self, fig: plt.Figure, ax: plt.Axes, grid: ClusterGrid) -> None: + """Initialize the clustermap canvas with extended layout grid context. + + Args: + fig: The Matplotlib Figure container hosting the ClusterGrid layout. + ax: The core underlying Axes mapping the central abundance heatmap. + grid: The raw Seaborn ClusterGrid object for deep layout mutations. + """ + super().__init__(fig, ax) + self.grid: ClusterGrid = grid def clustermap( df: pd.DataFrame, metadata: Optional[pd.DataFrame] = None, metadata_group: Optional[str] = None, - sample_order: Optional[List[str]] = None, + sample_order: Optional[Sequence[str]] = None, clust_linewidths: float = 0.5, clust_linecolor: str = "grey", x_axis: str = "Sample_id", y_axis: str = "taxon", - figsize: Optional[Tuple[int, int]] = None, + figsize: Optional[Tuple[float, float]] = None, cmap: str = "Greens", title: Optional[str] = None, title_fontsize: float = 16.0, title_color: str = "black", - title_weight: str = "normal", - title_style: str = "normal", + title_weight: Literal["normal", "bold", "heavy", "light"] = "normal", + title_style: Literal["normal", "italic", "oblique"] = "normal", xlabel: Optional[str] = None, ylabel: Optional[str] = None, xlabel_fontsize: float = 12.0, ylabel_fontsize: float = 12.0, xlabel_color: str = "black", ylabel_color: str = "black", - xlabel_weight: str = "normal", - ylabel_weight: str = "normal", - xlabel_style: str = "normal", - ylabel_style: str = "normal", + xlabel_weight: Literal["normal", "bold", "heavy", "light"] = "normal", + ylabel_weight: Literal["normal", "bold", "heavy", "light"] = "normal", + xlabel_style: 
Literal["normal", "italic", "oblique"] = "normal", + ylabel_style: Literal["normal", "italic", "oblique"] = "normal", xticks_rotation: float = 0.0, - xticks_ha: str = "center", + xticks_ha: Literal["left", "right", "center"] = "center", xticks_fontsize: float = 12.0, xticks_color: str = "black", - xticks_weight: str = "normal", - xticks_style: str = "normal", + xticks_weight: Literal["normal", "bold", "heavy", "light"] = "normal", + xticks_style: Literal["normal", "italic", "oblique"] = "normal", yticks_rotation: float = 0.0, - yticks_ha: str = "left", + yticks_ha: Literal["left", "right", "center"] = "left", yticks_fontsize: float = 12.0, yticks_color: str = "black", - yticks_weight: str = "normal", - yticks_style: str = "normal", + yticks_weight: Literal["normal", "bold", "heavy", "light"] = "normal", + yticks_style: Literal["normal", "italic", "oblique"] = "normal", standard_scale: Optional[int] = None, z_score: Optional[int] = None, legend_title: str = "Relative abundance (%)", cbar_pos: Tuple[float, float, float, float] = (1.02, 0.3, 0.03, 0.4), - background_color: str = "white", -): - """ - Generates a customizable clustermap to visualize the relative abundance across samples. - - Parameters: - - df: Pandas DataFrame containing the dataset. - - metadata: Optional DataFrame with sample metadata (must include 'Sample_id'). - - metadata_group: Column in `metadata` to group samples by for aggregation. - - sample_order: Optional list to specify the order of columns (samples) in the heatmap. - - clust_linewidths: Width of the lines that divide cells in the clustermap heatmap. - - clust_linecolor: Color of the grid lines separating the heatmap cells. - - x_axis, y_axis: Column name in `df` to be used for the X- or Y-axis. - - figsize: Tuple (width, height) of the figure. - - cmap: Colormap name (str) or list of colors. - - title: Main title of the plot. - - title_fontsize, title_color, title_weight, title_style: Title text styling. - - xlabel, ylabel: Axis labels. 
- - xlabel_fontsize, xlabel_color, xlabel_weight, xlabel_style: X-axis label styling. - - ylabel_fontsize, ylabel_color, ylabel_weight, ylabel_style: Y-axis label styling. - - xticks_rotation, xticks_ha: Rotation angle and alignment of X-axis tick labels. - - xticks_fontsize, xticks_color, xticks_weight, xticks_style: X-axis tick label styling. - - yticks_fontsize, yticks_color, yticks_weight, yticks_style: Y-axis tick label styling. - - legend_title: Title for the colorbar legend indicating what the values represent. - - cbar_pos: Tuple (X, Y, width, height) of the colorbar. - - background_color: Background color of the figure. + background_color: Optional[str] = "white", +) -> KpClustermap: + """Generate a highly customizable, publication-grade hierarchical clustermap. + + Pivots the long-format abundance table into a y_axis-by-x_axis matrix of + 'rel_abund_perc' values, clusters it with seaborn's ``clustermap``, and builds + a figure containing the heatmap together with its dendrograms. + + Args: + df: Input DataFrame containing categorical axes and continuous metrics. + metadata: Optional metadata sheet used for categorical sample mapping. + metadata_group: Column header inside metadata used to pool and average targets. + sample_order: Explicit column order; when given, column clustering is disabled. + clust_linewidths: Grid borderline widths dividing independent matrix cells. + clust_linecolor: Color of the grid lines separating the heatmap cells. + x_axis: Column key mapping individual sample profiles. Defaults to 'Sample_id'. + y_axis: Column key mapping individual microbial features. Defaults to 'taxon'. + figsize: Dimensional width and height tuple managing canvas allocations. + cmap: Color lookup palette or standard Matplotlib string color map identifier. + title: Global chart layout label string text. + title_fontsize, title_color, title_weight, title_style: Title style metrics.
+ xlabel, ylabel: Text descriptors mapping horizontal and vertical axes. + xlabel_fontsize, ylabel_fontsize: Axis layout font dimensions. + xlabel_color, ylabel_color: Text color variables mapping labels. + xlabel_weight, ylabel_weight: Structural typographic density metrics. + xlabel_style, ylabel_style: Geometric font slope configurations. + xticks_rotation, xticks_ha: Geometric transformation variables mapping x-ticks. + xticks_fontsize, xticks_color, xticks_weight, xticks_style: X-tick text parameters. + yticks_rotation, yticks_ha: Geometric transformation variables mapping y-ticks. + yticks_fontsize, yticks_color, yticks_weight, yticks_style: Y-tick text parameters. + standard_scale: Integer axis reference (0 or 1) applied to normalize matrices. + z_score: Integer axis reference (0 or 1) calculating standard Z score shifts. + legend_title: Context label string rendering adjacent to colorbars. + cbar_pos: Coordinates tracking anchor bounding limits for the legend block. + background_color: Primary layout canvas backdrop color mapping. Returns: - - KpClustermap: An object containing the clustermap figure and axis for customization or saving. - """ + KpClustermap: Context tracking wrapper ready for saving operations. - df = df.copy() + Raises: + ValueError: Triggered if specified sample elements fail data alignment steps. 
+ """ + working_df: pd.DataFrame = df.copy() + # Step 1: Conditionally execute metadata-driven group pooling operations if metadata is not None and metadata_group is not None: - df = aggregate_by_metadata(df, metadata, metadata_group) - - if df[y_axis].dtype == object: - other_mask = df[y_axis].str.startswith("Other") - taxon_order = list(df[other_mask][y_axis].unique()) + sorted( - df[~other_mask][y_axis].unique() + working_df = aggregate_by_metadata(working_df, metadata, metadata_group) + + # Step 2: Enforce structural categorization rules to isolate 'Other' catch-all components + if working_df[y_axis].dtype == object or isinstance( + working_df[y_axis].dtype, pd.CategoricalDtype + ): + unique_taxa: Sequence[str] = working_df[y_axis].unique() + other_mask: Sequence[bool] = [str(t).startswith("Other") for t in unique_taxa] + + taxon_order: list[str] = [ + t for t, m in zip(unique_taxa, other_mask) if m + ] + sorted([t for t, m in zip(unique_taxa, other_mask) if not m]) + working_df[y_axis] = pd.Categorical( + working_df[y_axis], categories=taxon_order, ordered=True ) - df[y_axis] = pd.Categorical(df[y_axis], categories=taxon_order, ordered=True) - pivot = df.pivot(index=y_axis, columns=x_axis, values="rel_abund_perc").fillna(0) + # Step 3: Reshape tabular datasets into algebraic continuous pivot frames + pivot: pd.DataFrame = working_df.pivot( + index=y_axis, columns=x_axis, values="rel_abund_perc" + ).fillna(0.0) + # Step 4: Validate and lock custom user column sequences if provided + col_cluster: bool = True if sample_order is not None: - missing = set(sample_order) - set(pivot.columns) - if missing: - raise ValueError(f"Samples missing from data: {missing}") - pivot = pivot[sample_order] - + missing_samples: set[str] = set(sample_order) - set(pivot.columns) + if missing_samples: + raise ValueError( + f"Samples missing from the clustermap matrix sequence alignment: {missing_samples}" + ) + pivot = pivot[list(sample_order)] col_cluster = False - else: - 
col_cluster = True - g = sns.clustermap( + # Step 5: Initialize and compute Seaborn clustering grid structures (with original defaults) + g: ClusterGrid = sns.clustermap( pivot, cmap=cmap, figsize=figsize, @@ -123,34 +167,43 @@ def clustermap( col_cluster=col_cluster, ) - g.fig.patch.set_facecolor(background_color) + if background_color is not None: + g.fig.patch.set_facecolor(background_color) - ax = g.ax_heatmap - fig = g.fig + ax: plt.Axes = g.ax_heatmap + fig: plt.Figure = g.fig - ax.set_title( - title, - fontsize=title_fontsize, - color=title_color, - weight=title_weight, - style=title_style, - ) - - ax.set_xlabel( - xlabel, - fontsize=xlabel_fontsize, - color=xlabel_color, - weight=xlabel_weight, - style=xlabel_style, - ) + # Step 6: Apply publication-grade typography modifications to layout canvas + if title: + ax.set_title( + title, + fontsize=title_fontsize, + color=title_color, + weight=title_weight, + style=title_style, + ) - ax.set_ylabel( - ylabel, - fontsize=ylabel_fontsize, - color=ylabel_color, - weight=ylabel_weight, - style=ylabel_style, - ) + if xlabel: + ax.set_xlabel( + xlabel, + fontsize=xlabel_fontsize, + color=xlabel_color, + weight=xlabel_weight, + style=xlabel_style, + ) + else: + ax.set_xlabel("") + + if ylabel: + ax.set_ylabel( + ylabel, + fontsize=ylabel_fontsize, + color=ylabel_color, + weight=ylabel_weight, + style=ylabel_style, + ) + else: + ax.set_ylabel("") plt.setp( ax.get_xticklabels(), @@ -172,5 +225,6 @@ def clustermap( style=yticks_style, ) + # Intercept immediate rendering frames to keep memory blocks sterile plt.close(fig) - return KpClustermap(fig, ax) + return KpClustermap(fig, ax, g) From c8aea1f00493b22a9f56c75c378d184bd5007f3b Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:41:56 +0200 Subject: [PATCH 11/25] refactor(kpplot): add strict literal constraints, type annotations, and docstrings to stackedbar.py --- krakenparser/kpplot/stackedbar.py | 227 +++++++++++++++++++----------- 1 file changed, 141 
insertions(+), 86 deletions(-) diff --git a/krakenparser/kpplot/stackedbar.py b/krakenparser/kpplot/stackedbar.py index f86cbcc..5f1d56b 100644 --- a/krakenparser/kpplot/stackedbar.py +++ b/krakenparser/kpplot/stackedbar.py @@ -1,4 +1,12 @@ -from typing import List, Optional, Tuple, Union +#!/usr/bin/env python3 +"""Stacked barplot visualization module for compositional metagenomic profiles. + +This module builds normalized, multi-component stacked column charts tracking +relative taxonomic abundance variations across independent sample coordinates +or aggregated experimental cohorts. +""" + +from typing import Literal, Optional, Sequence, Tuple, Union import matplotlib.pyplot as plt import numpy as np @@ -9,132 +17,174 @@ class KpStackedBarplot(KpPlotBase): - pass + """Orchestration context wrapper encapsulating Matplotlib stacked barplot layouts.""" + + def __init__(self, fig: plt.Figure, ax: plt.Axes) -> None: + """Initialize the stacked barplot canvas with layout metrics. + + Args: + fig: The Matplotlib Figure container hosting the drawing canvas. + ax: The core underlying Axes coordinate grid mapper. 
+ """ + super().__init__(fig, ax) def stacked_barplot( - df, + df: pd.DataFrame, metadata: Optional[pd.DataFrame] = None, metadata_group: Optional[str] = None, - sample_order: Optional[List[str]] = None, - figsize: Tuple[int, int] = (14, 7), - cmap: Optional[Union[str, List[str]]] = "tab20", + sample_order: Optional[Sequence[str]] = None, + figsize: Tuple[float, float] = (14.0, 7.0), + cmap: Union[str, Sequence[str]] = "tab20", bar_width: float = 0.6, edgecolor: str = "black", edge_linewidth: float = 0.3, title: Optional[str] = None, title_fontsize: float = 16.0, title_color: str = "black", - title_weight: str = "normal", - title_style: str = "normal", + title_weight: Literal["normal", "bold", "heavy", "light"] = "normal", + title_style: Literal["normal", "italic", "oblique"] = "normal", xlabel: str = "Samples", xlabel_fontsize: float = 12.0, xlabel_color: str = "black", - xlabel_weight: str = "normal", - xlabel_style: str = "normal", + xlabel_weight: Literal["normal", "bold", "heavy", "light"] = "normal", + xlabel_style: Literal["normal", "italic", "oblique"] = "normal", ylabel: str = "Relative Abundance (%)", ylabel_fontsize: float = 12.0, ylabel_color: str = "black", - ylabel_weight: str = "normal", - ylabel_style: str = "normal", + ylabel_weight: Literal["normal", "bold", "heavy", "light"] = "normal", + ylabel_style: Literal["normal", "italic", "oblique"] = "normal", xticks_rotation: float = 0.0, - xticks_ha: str = "center", + xticks_ha: Literal["left", "right", "center"] = "center", xticks_fontsize: float = 12.0, xticks_color: str = "black", - xticks_weight: str = "normal", - xticks_style: str = "normal", - background_color="white", + xticks_weight: Literal["normal", "bold", "heavy", "light"] = "normal", + xticks_style: Literal["normal", "italic", "oblique"] = "normal", + background_color: Optional[str] = "white", grid: bool = True, grid_linestyle: str = "--", grid_alpha: float = 0.7, legend_title: str = "Taxon", legend_fontsize: float = 9.0, - 
legend_fontstyle: str = "italic", + legend_fontstyle: Literal["normal", "italic", "oblique"] = "italic", legend_loc: str = "upper left", - legend_bbox: Tuple[float, float] = (1.05, 1), + legend_bbox: Tuple[float, float] = (1.05, 1.0), show_legend: bool = True, -): - """ - Generates a customizable stacked bar plot showing relative abundance values across samples. - - Parameters: - - df: Pandas DataFrame containing the dataset. - - metadata: Optional DataFrame with sample metadata (must include 'Sample_id'). - - metadata_group: Column in `metadata` to group samples by for aggregation. - - sample_order: Optional list to specify the order of columns (samples) in the barplot. - - figsize: Tuple (width, height) of the figure. - - cmap: Colormap name (str) or list of colors. - - bar_width: Width parameter for bars. - - edgecolor: Color of the edges (borders) drawn around each stacked area in the stacked barplot. - - edge_linewidth: Width of the edge lines around each stacked area. - - title: Title of the plot. - - title_fontsize, title_color, title_weight, title_style: Title styling. - - xlabel, ylabel: Axis labels. - - xlabel_fontsize, xlabel_color, xlabel_weight, xlabel_style: X-axis label styling. - - ylabel_fontsize, ylabel_color, ylabel_weight, ylabel_style: Y-axis label styling. - - xticks_rotation, xticks_ha: Rotation angle and alignment of x-axis tick labels. - - xticks_fontsize, xticks_color, xticks_weight, xticks_style: X-axis tick label styling. - - background_color: Background color of the figure. - - grid: Whether to display a grid. - - grid_color, grid_linestyle, grid_alpha: Grid styling. - - legend_fontsize: Font size for legend labels. - - legend_loc: Position of the legend. - - legend_bbox: Position of the legend box. - - show_legend: Whether to display the legend. +) -> KpStackedBarplot: + """Generate a publication-grade customizable stacked barplot for relative abundances. 
+ + Transforms taxonomic profiles into aligned cross-pivoted matrix structures, + orders categorical taxa to force 'Other' catch-all categories to the top/bottom, + applies custom color mapping schemas, and generates stacked column distributions. + + Args: + df: Input DataFrame containing tracking metrics ('Sample_id', 'taxon', 'rel_abund_perc'). + metadata: Optional worksheet schema mapping samples to experimental variables. + metadata_group: Column header within metadata used for cross-sample aggregation. + sample_order: Explicit layout sequence locking the display order on the X-axis. + figsize: Geometric allocation limits (width, height) defining canvas borders. + cmap: Target string name lookup or a sequential list of direct hexadecimal colors. + bar_width: Horizontal width metric allocated to independent drawing column bars. + edgecolor: Border styling color outline separating adjacent stacked blocks. + edge_linewidth: Thickness parameter of border outlines enclosing drawing cells. + title: Global text message identifier rendering above the drawing matrix. + title_fontsize, title_color, title_weight, title_style: Font properties for title. + xlabel, ylabel: Text content values mapping coordinates descriptors. + xlabel_fontsize, ylabel_fontsize: Typography scale indices. + xlabel_color, ylabel_color: Text color variables mapping target labels. + xlabel_weight, ylabel_weight: Structural typographic density metrics. + xlabel_style, ylabel_style: Geometric font slope configurations. + xticks_rotation, xticks_ha: Position variables mapping target X tick attributes. + xticks_fontsize, xticks_color, xticks_weight, xticks_style: X-tick typography rules. + background_color: Primary layout canvas backdrop color mapping. + grid: Toggles background coordinate reference line structures. + grid_linestyle: Grid line texture rendering parameter. + grid_alpha: Opacity index managing visibility bounds of grid elements. 
+ legend_title: Display title contextual wrapper tracking color keys. + legend_fontsize, legend_fontstyle: Typography rules mapping legends. + legend_loc: Positional anchoring code identifier tracking layout widgets. + legend_bbox: Coordinate anchor box offsets defining bounding regions for legends. + show_legend: If False, completely suppresses widget layer execution. Returns: - - KpStackedBarplot: An object containing the barplot figure and axis for customization or saving. - """ + KpStackedBarplot: Container instance holding references to optimized figures. - df = df.copy() + Raises: + ValueError: Triggered if sample targets or color map arrays violate alignment steps. + """ + working_df: pd.DataFrame = df.copy() + # Step 1: Conditionally execute group aggregation operations if metadata is not None and metadata_group is not None: - df = aggregate_by_metadata(df, metadata, metadata_group) + working_df = aggregate_by_metadata(working_df, metadata, metadata_group) + # Step 2: Validate sample elements and apply strict ordered categorical indices if sample_order is not None: - missing = set(sample_order) - set(df["Sample_id"].unique()) - if missing: - raise ValueError(f"Samples missing from data: {missing}") - df = df[df["Sample_id"].isin(sample_order)] - df["Sample_id"] = pd.Categorical( - df["Sample_id"], categories=sample_order, ordered=True + missing_samples: set[str] = set(sample_order) - set( + working_df["Sample_id"].unique() + ) + if missing_samples: + raise ValueError( + f"Samples missing from the data matrix sequence alignment: {missing_samples}" + ) + working_df = working_df[working_df["Sample_id"].isin(sample_order)].copy() + working_df["Sample_id"] = pd.Categorical( + working_df["Sample_id"], categories=list(sample_order), ordered=True ) - df["taxon"] = pd.Categorical( - df["taxon"], - categories=sorted( - df["taxon"].unique(), key=lambda x: (x.startswith("Other"), x) - ), - ordered=True, + + # Step 3: Extract and structure taxonomic sort categories 
ensuring 'Other' falls last + unique_taxa: Sequence[str] = working_df["taxon"].unique() + other_taxa: list[str] = sorted( + [t for t in unique_taxa if str(t).startswith("Other")] + ) + regular_taxa: list[str] = sorted( + [t for t in unique_taxa if not str(t).startswith("Other")] ) - df_plot = df.pivot( + taxon_categories: list[str] = regular_taxa + other_taxa + + working_df["taxon"] = pd.Categorical( + working_df["taxon"], categories=taxon_categories, ordered=True + ) + + # Step 4: Reshape layout spreadsheet structure via pivot operations + df_plot: pd.DataFrame = working_df.pivot( index="Sample_id", columns="taxon", values="rel_abund_perc" - ).fillna(0) + ).fillna(0.0) + # Step 5: Establish palette map dictionaries compliant with static analysis if isinstance(cmap, str): - color_dict = dict( - zip(df_plot.columns, sns.color_palette(cmap, n_colors=len(df_plot.columns))) - ) - elif isinstance(cmap, list): + palette_colors = sns.color_palette(cmap, n_colors=len(df_plot.columns)) + color_dict: dict[str, Union[str, tuple[float, ...]]] = { + str(col): color for col, color in zip(df_plot.columns, palette_colors) + } + elif isinstance(cmap, (list, tuple, np.ndarray, pd.Series)) or hasattr( + cmap, "__iter__" + ): if len(cmap) < len(df_plot.columns): raise ValueError( - f"cmap has {len(cmap)} colors but the data has {len(df_plot.columns)} " - "taxa; provide at least as many colors as taxa." + f"Color allocation array size mismatch: custom cmap palette has {len(cmap)} blocks " + f"but target dataset maps {len(df_plot.columns)} taxonomic items." ) - color_dict = dict(zip(df_plot.columns, cmap)) + color_dict = {str(col): color for col, color in zip(df_plot.columns, cmap)} else: - raise ValueError("cmap must be a str or a list of colors") + raise ValueError( + "Parameter 'cmap' violates validation constraints: must map to str or sequence." 
+ ) + # Apply specialized bioinformatic neutral-grey mapping overrides targeting unresolved fragments for col in color_dict: - if col.lower().startswith("other"): + if str(col).lower().startswith("other"): color_dict[col] = "#837b8d" - colors = [color_dict[col] for col in df_plot.columns] + colors_list = [color_dict[col] for col in df_plot.columns] + # Step 6: Initialize structural subplots container canvas layers fig, ax = plt.subplots(figsize=figsize, facecolor=background_color) + df_plot.plot( kind="bar", stacked=True, - color=colors, + color=colors_list, ax=ax, edgecolor=edgecolor, linewidth=edge_linewidth, @@ -142,13 +192,15 @@ def stacked_barplot( zorder=3, ) - ax.set_title( - title, - fontsize=title_fontsize, - color=title_color, - weight=title_weight, - style=title_style, - ) + # Step 7: Apply customized typography parameters to coordinate boundaries + if title: + ax.set_title( + title, + fontsize=title_fontsize, + color=title_color, + weight=title_weight, + style=title_style, + ) ax.set_xlabel( xlabel, fontsize=xlabel_fontsize, @@ -164,6 +216,7 @@ def stacked_barplot( style=ylabel_style, ) + # Step 8: Build legend overlays if requested by execution flags if show_legend: legend = ax.legend( title=legend_title, @@ -171,17 +224,19 @@ def stacked_barplot( loc=legend_loc, fontsize=legend_fontsize, ) - for text in legend.get_texts(): - text.set_fontstyle(legend_fontstyle) + if legend: + for text_node in legend.get_texts(): + text_node.set_fontstyle(legend_fontstyle) if grid: ax.grid(axis="y", linestyle=grid_linestyle, alpha=grid_alpha, zorder=0) + # Step 9: Configure ticks geometric parameters and close active canvas stream descriptors positions = np.arange(len(df_plot.index)) - labels = df_plot.index.tolist() + labels_list = df_plot.index.tolist() plt.xticks( positions, - labels, + labels_list, rotation=xticks_rotation, ha=xticks_ha, fontsize=xticks_fontsize, From b5dcee42d6c0d5062776a9ab72974fa538d1dd0d Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 
Jun 2026 09:42:54 +0200 Subject: [PATCH 12/25] refactor(kpplot): add strict literal constraints, type annotations, and docstrings to streamgraph.py --- krakenparser/kpplot/streamgraph.py | 238 ++++++++++++++++++----------- 1 file changed, 149 insertions(+), 89 deletions(-) diff --git a/krakenparser/kpplot/streamgraph.py b/krakenparser/kpplot/streamgraph.py index f858be0..6a2719e 100644 --- a/krakenparser/kpplot/streamgraph.py +++ b/krakenparser/kpplot/streamgraph.py @@ -1,4 +1,12 @@ -from typing import List, Optional, Tuple, Union +#!/usr/bin/env python3 +"""Streamgraph visualization module for continuous-like metagenomic cohort profiles. + +This module renders smooth, contiguous stacked area charts representing +the progression and shifts of relative taxonomic abundances across samples +or grouped metadata categories. +""" + +from typing import Any, Literal, Optional, Sequence, Tuple, Union import matplotlib.pyplot as plt import numpy as np @@ -9,16 +17,25 @@ class KpStreamgraph(KpPlotBase): - pass + """Orchestration context wrapper encapsulating Matplotlib streamgraph layouts.""" + + def __init__(self, fig: plt.Figure, ax: plt.Axes) -> None: + """Initialize the streamgraph canvas with layout metrics. + + Args: + fig: The Matplotlib Figure container hosting the drawing canvas. + ax: The core underlying Axes coordinate grid mapper. 
+ """ + super().__init__(fig, ax) def streamgraph( - df, + df: pd.DataFrame, metadata: Optional[pd.DataFrame] = None, metadata_group: Optional[str] = None, - sample_order: Optional[List[str]] = None, - figsize: Tuple[int, int] = (14, 7), - cmap: Optional[Union[str, List[str]]] = "tab20", + sample_order: Optional[Sequence[str]] = None, + figsize: Tuple[float, float] = (14.0, 7.0), + cmap: Union[str, Sequence[str]] = "tab20", bar_width: float = 0.6, fill_alpha: float = 1.0, edgecolor: Optional[str] = None, @@ -26,114 +43,150 @@ def streamgraph( title: Optional[str] = None, title_fontsize: float = 16.0, title_color: str = "black", - title_weight: str = "normal", - title_style: str = "normal", + title_weight: Literal["normal", "bold", "heavy", "light"] = "normal", + title_style: Literal["normal", "italic", "oblique"] = "normal", xlabel: str = "Samples", xlabel_fontsize: float = 12.0, xlabel_color: str = "black", - xlabel_weight: str = "normal", - xlabel_style: str = "normal", + xlabel_weight: Literal["normal", "bold", "heavy", "light"] = "normal", + xlabel_style: Literal["normal", "italic", "oblique"] = "normal", ylabel: str = "Relative Abundance (%)", ylabel_fontsize: float = 12.0, ylabel_color: str = "black", - ylabel_weight: str = "normal", - ylabel_style: str = "normal", + ylabel_weight: Literal["normal", "bold", "heavy", "light"] = "normal", + ylabel_style: Literal["normal", "italic", "oblique"] = "normal", xticks_rotation: float = 0.0, - xticks_ha: str = "center", + xticks_ha: Literal["left", "right", "center"] = "center", xticks_fontsize: float = 12.0, xticks_color: str = "black", - xticks_weight: str = "normal", - xticks_style: str = "normal", - background_color="white", + xticks_weight: Literal["normal", "bold", "heavy", "light"] = "normal", + xticks_style: Literal["normal", "italic", "oblique"] = "normal", + background_color: Optional[str] = "white", grid: bool = True, grid_linestyle: str = "--", grid_alpha: float = 0.7, legend_title: str = "Taxon", 
legend_fontsize: float = 9.0, - legend_fontstyle: str = "italic", + legend_fontstyle: Literal["normal", "italic", "oblique"] = "italic", legend_loc: str = "upper left", - legend_bbox: Tuple[float, float] = (1.05, 1), + legend_bbox: Tuple[float, float] = (1.05, 1.0), show_legend: bool = True, -): - """ - Generates a customizable streamgraph plot showing relative abundance values across samples. - - Parameters: - - df: Pandas DataFrame containing the dataset. - - metadata: Optional DataFrame with sample metadata (must include 'Sample_id'). - - metadata_group: Column in `metadata` to group samples by for aggregation. - - sample_order: Optional list to specify the order of columns (samples) in the heatmap. - - figsize: Tuple (width, height) of the figure. - - cmap: Colormap name (str) or list of colors. - - fill_alpha: Transparency of the filled areas. - - edgecolor: Color of the edges (borders) drawn around each stacked area in the streamgraph. - - edge_linewidth: Width of the edge lines around each stacked area. - - title: Title of the plot. - - title_fontsize, title_color, title_weight, title_style: Title styling. - - xlabel, ylabel: Axis labels. - - xlabel_fontsize, xlabel_color, xlabel_weight, xlabel_style: X-axis label styling. - - ylabel_fontsize, ylabel_color, ylabel_weight, ylabel_style: Y-axis label styling. - - xticks_rotation, xticks_ha: Rotation angle and alignment of x-axis tick labels. - - xticks_fontsize, xticks_color, xticks_weight, xticks_style: X-axis tick label styling. - - background_color: Background color of the figure. - - grid: Whether to display a grid. - - grid_color, grid_linestyle, grid_linewidth: Grid styling. - - legend_fontsize: Font size for legend labels. - - legend_loc: Position of the legend. - - legend_bbox: Position of the legend box. - - show_legend: Whether to display the legend. +) -> KpStreamgraph: + """Generate a highly customizable streamgraph/stacked-area plot for relative abundances. 
+ + Transforms microbial datasets into aligned matrices, formats taxonomic categories + ensuring 'Other' components sink to the baseline, flattens continuous area coordinates, + and handles custom color palettes without static analysis overloads. + + Args: + df: Input DataFrame containing tracking metrics ('Sample_id', 'taxon', 'rel_abund_perc'). + metadata: Optional worksheet schema mapping samples to experimental variables. + metadata_group: Column header within metadata used for cross-sample aggregation. + sample_order: Explicit layout sequence locking the display order on the X-axis. + figsize: Geometric allocation limits (width, height) defining canvas borders. + cmap: Target string name lookup or a sequential list of direct hexadecimal colors. + bar_width: Horizontal span coefficient used to step and duplicate area limits. + fill_alpha: Opacity index managing transparency limits of the filled polygons. + edgecolor: Border styling color outline separating adjacent stacked streams. + edge_linewidth: Thickness parameter of border outlines enclosing stream paths. + title: Global text message identifier rendering above the drawing matrix. + title_fontsize, title_color, title_weight, title_style: Font properties for title. + xlabel, ylabel: Text content values mapping coordinates descriptors. + xlabel_fontsize, ylabel_fontsize: Typography scale indices. + xlabel_color, ylabel_color: Text color variables mapping target labels. + xlabel_weight, ylabel_weight: Structural typographic density metrics. + xlabel_style, ylabel_style: Geometric font slope configurations. + xticks_rotation, xticks_ha: Position variables mapping target X tick attributes. + xticks_fontsize, xticks_color, xticks_weight, xticks_style: X-tick typography rules. + background_color: Primary layout canvas backdrop color mapping. + grid: Toggles background coordinate reference line structures. + grid_linestyle: Grid line texture rendering parameter. 
+ grid_alpha: Opacity index managing visibility bounds of grid elements. + legend_title: Display title contextual wrapper tracking color keys. + legend_fontsize, legend_fontstyle: Typography rules mapping legends. + legend_loc: Positional anchoring code identifier tracking layout widgets. + legend_bbox: Coordinate anchor box offsets defining bounding regions for legends. + show_legend: If False, completely suppresses widget layer execution. Returns: - - KpStreamgraph: An object containing the streamgraph figure and axis for customization or saving. - """ + KpStreamgraph: Container instance holding references to optimized figures. - df = df.copy() + Raises: + ValueError: Triggered if sample targets or color map arrays violate alignment steps. + """ + working_df: pd.DataFrame = df.copy() + # Step 1: Conditionally execute group aggregation operations if metadata is not None and metadata_group is not None: - df = aggregate_by_metadata(df, metadata, metadata_group) + working_df = aggregate_by_metadata(working_df, metadata, metadata_group) + # Step 2: Validate sample elements and apply strict ordered categorical indices if sample_order is not None: - missing = set(sample_order) - set(df["Sample_id"].unique()) - if missing: - raise ValueError(f"Samples missing from data: {missing}") - df = df[df["Sample_id"].isin(sample_order)] - df["Sample_id"] = pd.Categorical( - df["Sample_id"], categories=sample_order, ordered=True + missing_samples: set[str] = set(sample_order) - set( + working_df["Sample_id"].unique() + ) + if missing_samples: + raise ValueError( + f"Samples missing from the data matrix sequence alignment: missing {missing_samples}" + ) + working_df = working_df[working_df["Sample_id"].isin(sample_order)].copy() + working_df["Sample_id"] = pd.Categorical( + working_df["Sample_id"], categories=list(sample_order), ordered=True ) - df["taxon"] = pd.Categorical( - df["taxon"], - categories=sorted( - df["taxon"].unique(), key=lambda x: (x.startswith("Other"), x) - ), - 
ordered=True, + + # Step 3: Extract and structure taxonomic sort categories ensuring 'Other' falls last + unique_taxa: Sequence[str] = working_df["taxon"].unique() + other_taxa: list[str] = sorted( + [t for t in unique_taxa if str(t).startswith("Other")] + ) + regular_taxa: list[str] = sorted( + [t for t in unique_taxa if not str(t).startswith("Other")] + ) + taxon_categories: list[str] = regular_taxa + other_taxa + + working_df["taxon"] = pd.Categorical( + working_df["taxon"], categories=taxon_categories, ordered=True ) - df_plot = df.pivot( + + # Step 4: Reshape layout spreadsheet structure via pivot operations + df_plot: pd.DataFrame = working_df.pivot( index="Sample_id", columns="taxon", values="rel_abund_perc" - ).fillna(0) + ).fillna(0.0) + # Step 5: Establish palette map dictionaries compliant with static analysis if isinstance(cmap, str): - color_dict = dict( - zip(df_plot.columns, sns.color_palette(cmap, n_colors=len(df_plot.columns))) - ) - elif isinstance(cmap, list): + palette_colors = sns.color_palette(cmap, n_colors=len(df_plot.columns)) + color_dict: dict[str, Any] = { + str(col): color for col, color in zip(df_plot.columns, palette_colors) + } + elif isinstance(cmap, (list, tuple, np.ndarray, pd.Series)) or hasattr( + cmap, "__iter__" + ): if len(cmap) < len(df_plot.columns): raise ValueError( - f"cmap has {len(cmap)} colors but the data has {len(df_plot.columns)} " - "taxa; provide at least as many colors as taxa." + f"Color allocation array size mismatch: custom cmap palette has {len(cmap)} blocks " + f"but target dataset maps {len(df_plot.columns)} taxonomic items." ) - color_dict = dict(zip(df_plot.columns, cmap)) + color_dict = {str(col): color for col, color in zip(df_plot.columns, cmap)} else: - raise ValueError("cmap must be a str or a list of colors") + raise ValueError( + "Parameter 'cmap' violates validation constraints: must map to str or sequence." 
+ ) + # Apply specialized bioinformatic neutral-grey mapping overrides targeting unresolved fragments for col in color_dict: - if col.lower().startswith("other"): + if str(col).lower().startswith("other"): color_dict[col] = "#837b8d" - colors = [color_dict[col] for col in df_plot.columns] + colors_list: list[Any] = [color_dict[col] for col in df_plot.columns] + # Step 6: Flat coordinates preparation for stream area alignment centers = np.arange(len(df_plot.index)) - xs = np.column_stack((centers - bar_width / 2, centers + bar_width / 2)).flatten() + xs = np.column_stack( + (centers - bar_width / 2.0, centers + bar_width / 2.0) + ).flatten() + # Step 7: Initialize structural container canvas layers fig, ax = plt.subplots(figsize=figsize, facecolor=background_color) ys = np.repeat(df_plot.values.T, 2, axis=1) @@ -141,22 +194,27 @@ def streamgraph( xs, ys, labels=df_plot.columns, - colors=colors, + colors=colors_list, alpha=fill_alpha, zorder=3, ) + resolved_edgecolor: str = edgecolor if edgecolor is not None else "none" + + # Atomic configurations updates on polygon streams objects for poly in layers: - poly.set_edgecolor(edgecolor) + poly.set_edgecolor(resolved_edgecolor) poly.set_linewidth(edge_linewidth) - ax.set_title( - title, - fontsize=title_fontsize, - color=title_color, - weight=title_weight, - style=title_style, - ) + # Step 8: Apply customized typography parameters to coordinate boundaries + if title: + ax.set_title( + title, + fontsize=title_fontsize, + color=title_color, + weight=title_weight, + style=title_style, + ) ax.set_xlabel( xlabel, fontsize=xlabel_fontsize, @@ -172,6 +230,7 @@ def streamgraph( style=ylabel_style, ) + # Step 9: Build legend overlays if requested by execution flags if show_legend: legend = ax.legend( title=legend_title, @@ -179,16 +238,18 @@ def streamgraph( loc=legend_loc, fontsize=legend_fontsize, ) - for text in legend.get_texts(): - text.set_fontstyle(legend_fontstyle) + if legend: + for text_node in legend.get_texts(): + 
text_node.set_fontstyle(legend_fontstyle) if grid: ax.grid(axis="y", linestyle=grid_linestyle, alpha=grid_alpha, zorder=0) - labels = df_plot.index.tolist() + # Step 10: Configure ticks geometric parameters and close active canvas streams + labels_list = df_plot.index.tolist() plt.xticks( centers, - labels, + labels_list, rotation=xticks_rotation, ha=xticks_ha, fontsize=xticks_fontsize, @@ -200,5 +261,4 @@ def streamgraph( ax.set_xlim(-0.5, len(df_plot.index) - 0.5) plt.close(fig) - return KpStreamgraph(fig, ax) From fe6464db669d741dc7303d9b909cdbfb581b78a8 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:44:29 +0200 Subject: [PATCH 13/25] refactor(mpa): implement explicit type hints, and docstrings in mpa_table.py --- krakenparser/mpa/mpa_table.py | 71 ++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 21 deletions(-) diff --git a/krakenparser/mpa/mpa_table.py b/krakenparser/mpa/mpa_table.py index 2c30e79..a6f13b7 100644 --- a/krakenparser/mpa/mpa_table.py +++ b/krakenparser/mpa/mpa_table.py @@ -1,5 +1,11 @@ -#!/usr/bin/env python -"""Combine multiple MPA-format files into a single merged table.""" +#!/usr/bin/env python3 +"""Aggregation engine for merging multiple MetaPhlAn (MPA) files into a unified matrix. + +This module parses multi-sample taxonomic report files formatted in the MetaPhlAn +style, extracts their respective abundance sequences, tracks phylogenetic tree +parent-child relationships, and performs a stack-based traversal to output a +structurally ordered, tab-delimited master count matrix. 
+""" import logging import sys @@ -10,9 +16,11 @@ from krakenparser.utils import ensure_output_dir -_log = logging.getLogger(__name__) +# Initialize module-level isolated logger +_log: logging.Logger = logging.getLogger(__name__) -app = typer.Typer( +# Dedicated Typer routing application instantiation +app: typer.Typer = typer.Typer( name="combine", add_completion=False, context_settings={"help_option_names": ["-h", "--help"]}, @@ -20,31 +28,49 @@ def combine_mpa(in_files: list[Path], o_file: Path) -> None: - out_path = ensure_output_dir(str(o_file), is_file=True) + """Merge separate MPA taxonomic distribution tables into a single master matrix. + + Parses header metadata strings to resolve human-readable sample IDs, caches + hierarchical dependencies to maintain strict lineage orientation, and combines + individual abundance values into an integrated layout table. + + Args: + in_files: List of validated Path objects directing to sample MPA files. + o_file: Target Path where the combined tab-delimited table will be written. + Raises: + FileNotFoundError: Triggered if any referenced input file is absent. 
+ """ + out_path: Path = ensure_output_dir(o_file, is_file=True) + + # Architectural storage definitions for parsing alignment graphs samples: dict[int, str] = {} values: dict[str, dict[int, str]] = {} parent2child: dict[str, list[str]] = {} toparse: list[str] = [] - sample_count = 0 + sample_count: int = 0 _log.info("Number of files to parse: %d", len(in_files)) + # Atomic verification step protecting IO transactions for in_path in in_files: if not in_path.is_file(): raise FileNotFoundError(f"Input file not found: {in_path}") + # Step 1: Scan individual reports and populate hierarchical graphs for in_path in in_files: sample_count += 1 - sample_name = f"Sample #{sample_count}" + sample_name: str = f"Sample #{sample_count}" - with open(in_path) as fh: + with open(in_path, encoding="utf-8") as fh: for line in fh: line = line.rstrip("\n") if not line: continue + + # Isolate and extract sample identifier mapping embedded within headers if line.startswith("#"): - cols = line.split("\t") + cols: list[str] = line.split("\t") if len(cols) >= 2: sample_name = cols[-1] continue @@ -54,10 +80,11 @@ def combine_mpa(in_files: list[Path], o_file: Path) -> None: continue classification, val = cols[0], cols[1] - split_vals = classification.split("|") - curr_parent = "" + # Resolve lineage parent node identities to guarantee structural order + split_vals: list[str] = classification.split("|") + curr_parent: str = "" for i in range(len(split_vals)): - test_val = "|".join(split_vals[:i]) + test_val: str = "|".join(split_vals[:i]) if test_val in values: curr_parent = test_val @@ -76,22 +103,24 @@ def combine_mpa(in_files: list[Path], o_file: Path) -> None: samples[sample_count] = sample_name - n_taxa = len(values) + n_taxa: int = len(values) _log.info("Number of classifications to write: %d", n_taxa) - count_written = 0 - with open(out_path, "w") as fh: - header = "#Classification\t" + "\t".join( + # Step 2: Traverse graph using a stack buffer to stream records layout-ready + 
count_written: int = 0 + with open(out_path, "w", encoding="utf-8") as fh: + header: str = "#Classification\t" + "\t".join( samples[i] for i in range(1, sample_count + 1) ) fh.write(header + "\n") - stack = list(toparse) + stack: list[str] = list(toparse) while stack: - curr = stack.pop(0) + curr: str = stack.pop(0) if curr in parent2child: stack = parent2child[curr] + stack - row = "\t".join( + + row: str = "\t".join( values[curr].get(i, "0") for i in range(1, sample_count + 1) ) fh.write(curr + "\t" + row + "\n") @@ -107,13 +136,13 @@ def main( None, "-i", "--input", - help="Input MPA files (one per sample). Повторите флаг -i для каждого файла.", + help="Input MPA files (one per sample). Repeat the '-i' option for multiple files.", ), o_file: Optional[Path] = typer.Option( None, "-o", "--output", - help="Output merged MPA file.", + help="Output merged MPA file path.", ), ) -> None: """Combine MPA files into a single tab-delimited table.""" From 0560267c7b1734eafa917a4dd2470d63e9381d59 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:47:03 +0200 Subject: [PATCH 14/25] refactor(mpa): add strict type annotations, context managers, and docstrings to transform2mpa.py --- krakenparser/mpa/transform2mpa.py | 135 +++++++++++++++++++----------- 1 file changed, 85 insertions(+), 50 deletions(-) diff --git a/krakenparser/mpa/transform2mpa.py b/krakenparser/mpa/transform2mpa.py index 1ec5fd7..a64de28 100644 --- a/krakenparser/mpa/transform2mpa.py +++ b/krakenparser/mpa/transform2mpa.py @@ -1,51 +1,64 @@ -#!/usr/bin/env python -"""Convert a Kraken2 report to MetaPhlAn (MPA) format.""" +#!/usr/bin/env python3 +"""Taxonomic format converter translating Kraken2 reports to MetaPhlAn (MPA) layout. + +This module parses standard space-indented hierarchical Kraken2 and KrakenUniq output +reports, tracks taxonomic depth changes through parent lineage state machines, +and converts records into pipe-separated '|' multi-level lineage tracks. 
+""" import logging -import os import sys from pathlib import Path -from typing import Optional +from typing import Any, Optional import typer from krakenparser.utils import ensure_output_dir -_log = logging.getLogger(__name__) +# Initialize module-level isolated logger +_log: logging.Logger = logging.getLogger(__name__) -app = typer.Typer( +# Dedicated Typer routing application instantiation +app: typer.Typer = typer.Typer( name="mpa", add_completion=False, context_settings={"help_option_names": ["-h", "--help"]}, ) -_MAIN_LVLS = {"R", "K", "D", "P", "C", "O", "F", "G", "S"} +# Standard strict taxonomic reference limits +_MAIN_LVLS: set[str] = {"R", "K", "D", "P", "C", "O", "F", "G", "S"} def _parse_line(line: str, remove_spaces: bool = False) -> list: - """Parse one Kraken2 report line. + """Parse a single Kraken2 or KrakenUniq report row and extract vital stats. + + Handles standard kraken formats alongside KrakenUniq outputs by mapping text + labels to fixed-width single-character taxonomic rank designators. - Returns [name, level_num, level_type, all_reads, percents] - or empty list on malformed input. + Args: + line: A raw tab-delimited line from the report file. + remove_spaces: If True, internal spaces within organism nomenclature + strings are mapped securely to structural underscores. + + Returns: + list: A data list containing [cleaned_name, level_num, level_type, total_reads, relative_percentage] + or an empty list [] if the line format violates parser syntactic assumptions. 
""" - parts = line.rstrip("\n").split("\t") + parts: list[str] = line.rstrip("\n").split("\t") if len(parts) < 4: return [] - try: - int(parts[1]) - except ValueError: - return [] try: - percents = float(parts[0]) + percents: float = float(parts[0]) + all_reads: int = int(parts[1]) except ValueError: return [] - all_reads = int(parts[1]) + # Detect and handle alternative KrakenUniq columns format if applicable try: int(parts[-3]) - level_type = parts[-2].strip() - map_kuniq = { + level_type: str = parts[-2].strip() + map_kuniq: dict[str, str] = { "species": "S", "genus": "G", "family": "F", @@ -55,25 +68,20 @@ def _parse_line(line: str, remove_spaces: bool = False) -> list: "superkingdom": "D", "kingdom": "K", } - if level_type not in map_kuniq: - level_type = "-" - else: - level_type = map_kuniq[level_type] + level_type = map_kuniq.get(level_type, "-") except ValueError: level_type = parts[-3].strip() - name = parts[-1] - spaces = 0 - for ch in name: - if ch == " ": - spaces += 1 - else: - break - name = name.strip() + raw_name: str = parts[-1] + + # High-performance calculation of leading double-space indentation metrics + spaces: int = len(raw_name) - len(raw_name.lstrip(" ")) + name: str = raw_name.strip() + if remove_spaces: name = name.replace(" ", "_") - level_num = spaces / 2 + level_num: float = spaces / 2 return [name, level_num, level_type, all_reads, percents] @@ -85,28 +93,49 @@ def kreport_to_mpa( use_reads: bool = True, remove_spaces: bool = True, ) -> None: - """Convert a single Kraken2 report to MPA format.""" + """Transform an individual Kraken2 report matrix file into an MPA lineage table. + + Iterates over lines sequentially, dynamically collapsing or expanding an internal + lineage stack buffer when tracking changes in indentation depths. + + Args: + report_path: Path to the validated incoming raw text file. + output_path: Path where the converted tracking table will be dumped. 
+ display_header: If True, writes a header indicating source provenance metadata. + include_intermediate: If True, non-standard ranks are preserved under 'x__' tags. + use_reads: If True, maps absolute counts. If False, streams relative percentage scores. + remove_spaces: If True, replaces standard word spaces inside strings with underscores. + + Raises: + FileNotFoundError: Triggered if the target source input file is not found. + """ if not report_path.is_file(): raise FileNotFoundError(f"Input file not found: {report_path}") - out_path = ensure_output_dir(str(output_path), is_file=True) + + out_path: Path = ensure_output_dir(output_path, is_file=True) curr_path: list[str] = [] - prev_lvl_num = -1 + prev_lvl_num: float = -1.0 - with open(report_path) as r_fh, open(out_path, "w") as o_fh: + with ( + open(report_path, encoding="utf-8") as r_fh, + open(out_path, "w", encoding="utf-8") as o_fh, + ): if display_header: - o_fh.write("#Classification\t" + os.path.basename(report_path) + "\n") + o_fh.write(f"#Classification\t{report_path.name}\n") for line in r_fh: report_vals = _parse_line(line, remove_spaces) - if len(report_vals) < 5: + if not report_vals: continue name, level_num, level_type, all_reads, percents = report_vals + # Safely drop unclassified sequencing categories ('U') if level_type == "U": continue + # Standardize non-canonical levels to match MetaPhlAn structural styles if level_type not in _MAIN_LVLS: level_type = "x" elif level_type == "K": @@ -114,26 +143,29 @@ def kreport_to_mpa( elif level_type == "D": level_type = "d" - level_str = level_type.lower() + "__" + name + level_str: str = f"{level_type.lower()}__{name}" - if prev_lvl_num == -1: + # Setup baseline root node conditions + if prev_lvl_num == -1.0: prev_lvl_num = level_num curr_path.append(level_str) continue - while level_num != (prev_lvl_num + 1): - prev_lvl_num -= 1 + # Step out of current lineage stack frames if depth levels step backward + while level_num != (prev_lvl_num + 1.0): + 
prev_lvl_num -= 1.0 curr_path.pop() + # Conditionally pipe clean taxonomy paths down to file IO streams if (level_type == "x" and include_intermediate) or level_type != "x": - ancestors = [ + ancestors: list[str] = [ seg for seg in curr_path if (seg[0] != "x" or include_intermediate) and seg[0] != "r" ] - path = "|".join(ancestors + [level_str]) - value = str(all_reads) if use_reads else str(percents) - o_fh.write(path + "\t" + value + "\n") + path: str = "|".join(ancestors + [level_str]) + value: str = str(all_reads) if use_reads else str(percents) + o_fh.write(f"{path}\t{value}\n") curr_path.append(level_str) prev_lvl_num = level_num @@ -207,10 +239,10 @@ def main( ) raise typer.Exit(code=1) - use_reads = not percentages - remove_spaces = not keep_spaces + use_reads: bool = not percentages + remove_spaces: bool = not keep_spaces - kwargs = dict( + kwargs: dict[str, Any] = dict( display_header=display_header, include_intermediate=intermediate_ranks, use_reads=use_reads, @@ -227,10 +259,13 @@ def main( for f in sorted(input_dir.iterdir()): if not f.is_file(): continue - out_name = f.name.replace(".kreport", ".MPA.TXT") + out_name: str = f.name.replace(".kreport", ".MPA.TXT") kreport_to_mpa(f, o_file / out_name, **kwargs) _log.info("Converted to MPA successfully. Output stored in %s", o_file) else: + assert r_file is not None, ( + "Internal error: r_file is missing in singleton mode." 
+ ) kreport_to_mpa(r_file, o_file, **kwargs) except FileNotFoundError as e: From ce218ae4ede85d746ac3ed0cc0871cd26d64c3c1 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:48:06 +0200 Subject: [PATCH 15/25] refactor(stats): implement strict type hints, logging, and docstrings in diversity.py --- krakenparser/stats/diversity.py | 184 +++++++++++++++++++++++--------- 1 file changed, 136 insertions(+), 48 deletions(-) diff --git a/krakenparser/stats/diversity.py b/krakenparser/stats/diversity.py index ef276d9..e9dfb29 100644 --- a/krakenparser/stats/diversity.py +++ b/krakenparser/stats/diversity.py @@ -1,9 +1,15 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 +"""Statistical module for calculating microbial community alpha and beta diversities. + +This module provides industry-standard ecological metrics including Shannon Index, +Pielou's Evenness, and Chao1 Richness for alpha diversity, alongside rarefaction-backed +Bray-Curtis and Jaccard distance metrics for beta diversity analysis. +""" import logging import sys from pathlib import Path -from typing import Optional +from typing import Any, Optional, Sequence import numpy as np import pandas as pd @@ -12,54 +18,117 @@ from krakenparser.utils import ensure_output_dir -_log = logging.getLogger(__name__) +# Initialize module-level isolated logger +_log: logging.Logger = logging.getLogger(__name__) -app = typer.Typer( +# Dedicated Typer routing application instantiation +app: typer.Typer = typer.Typer( name="diversity", add_completion=False, context_settings={"help_option_names": ["-h", "--help"]}, ) -def shannon_index(counts): - counts = np.array(counts) - counts = counts[counts > 0] - proportions = counts / counts.sum() - return -np.sum(proportions * np.log(proportions)) +def shannon_index(counts: np.ndarray | Sequence[float] | Sequence[int]) -> float: + """Calculate the Shannon-Wiener diversity index (H') for a count vector. 
+ + The index is computed using the formula: H' = -sum(p_i * ln(p_i)), + where p_i is the relative abundance of each present taxon. + + Args: + counts: A sequence or array of absolute taxonomic abundance counts. + + Returns: + float: The calculated Shannon diversity index. + """ + counts_arr: np.ndarray = np.array(counts) + nonzero_counts: np.ndarray = counts_arr[counts_arr > 0] + + if nonzero_counts.size == 0: + return 0.0 + + proportions: np.ndarray = nonzero_counts / nonzero_counts.sum() + return float(-np.sum(proportions * np.log(proportions))) + + +def pielou_evenness(counts: np.ndarray | Sequence[float] | Sequence[int]) -> float: + """Calculate Pielou's Evenness index (J') for a count vector. + + The index is computed as: J' = H' / ln(S), where H' is the Shannon index + and S is the total number of observed species (richness). + + Args: + counts: A sequence or array of absolute taxonomic abundance counts. + Returns: + float: Pielou's evenness value, or np.nan if species richness <= 1. + """ + counts_arr: np.ndarray = np.asarray(counts) + species_richness: int = int(np.sum(counts_arr > 0)) -def pielou_evenness(counts): - counts = np.asarray(counts) - S = int(np.sum(counts > 0)) - if S <= 1: - return np.nan - return shannon_index(counts) / np.log(S) + if species_richness <= 1: + return float(np.nan) + return shannon_index(counts_arr) / float(np.log(species_richness)) -def chao1_index(counts): - counts = np.array(counts) - S_obs = np.sum(counts > 0) - F1 = np.sum(counts == 1) - F2 = np.sum(counts == 2) - if F2 == 0: - return S_obs + F1 * (F1 - 1) / 2 - return S_obs + (F1 * F1) / (2 * F2) + +def chao1_index(counts: np.ndarray | Sequence[float] | Sequence[int]) -> float: + """Calculate the Chao1 non-parametric richness estimator for a community. + + Accounts for rare unsampled species based on singletons (F1) and doubletons (F2). + Formula: S_chao1 = S_obs + (F1 * (F1 - 1)) / (2 * (F2 + 1)) if F2 == 0 + else S_obs + (F1^2) / (2 * F2). 
+ + Args: + counts: A sequence or array of absolute taxonomic abundance counts. + + Returns: + float: The estimated total species richness. + """ + counts_arr: np.ndarray = np.array(counts) + species_observed: int = int(np.sum(counts_arr > 0)) + singletons: int = int(np.sum(counts_arr == 1)) + doubletons: int = int(np.sum(counts_arr == 2)) + + if doubletons == 0: + return float(species_observed + singletons * (singletons - 1) / 2) + + return float(species_observed + (singletons * singletons) / (2 * doubletons)) def _subsample_counts( counts: np.ndarray, n: int, rng: np.random.Generator ) -> np.ndarray: - """Rarefy counts to n reads by sampling without replacement.""" - indices = np.repeat(np.arange(len(counts)), counts) - sampled = rng.choice(indices, size=n, replace=False) + """Rarefy absolute abundance counts to a uniform depth without replacement. + + Args: + counts: Array of absolute integers representing community abundances. + n: Targeted sequencing read subsampling depth (rarefaction size). + rng: An instantiated NumPy random generator state. + + Returns: + np.ndarray: A new rarefied absolute abundance vector matching the source shape. + """ + indices: np.ndarray = np.repeat(np.arange(len(counts)), counts) + sampled: np.ndarray = rng.choice(indices, size=n, replace=False) return np.bincount(sampled, minlength=len(counts)).astype(int) def calc_alpha_div(df: pd.DataFrame, output_path: Path) -> None: - out_path = ensure_output_dir(str(output_path), is_file=False) - results = [] + """Compute alpha diversity vectors across all profiles within a count matrix. + + Generates a structured CSV data table containing Shannon, Pielou, and Chao1 + indices mapped natively to individual sample identifiers. + + Args: + df: Input DataFrame where indices represent samples and columns indicate taxa. + output_path: Target directory Path where results are exported. 
+ """ + out_path: Path = ensure_output_dir(output_path, is_file=False) + results: list[dict[str, Any]] = [] + for sample_id, row in df.iterrows(): - counts = row.values + counts: np.ndarray = row.values results.append( { "Sample": sample_id, @@ -68,11 +137,12 @@ def calc_alpha_div(df: pd.DataFrame, output_path: Path) -> None: "Chao1": chao1_index(counts), } ) - alpha_df = pd.DataFrame(results).set_index("Sample") + + alpha_df: pd.DataFrame = pd.DataFrame(results).set_index("Sample") alpha_df.to_csv(out_path / "alpha_div.csv") _log.info( - f"α-diversity has been successfully calculated and saved to '{output_path}'." + "α-diversity has been successfully calculated and saved to '%s'.", output_path ) @@ -82,40 +152,58 @@ def calc_beta_div( rarefaction_depth: int, seed: Optional[int] = None, ) -> None: - out_path = ensure_output_dir(str(output_path), is_file=False) - rng = np.random.default_rng(seed) + """Compute composition dissimilarity matrices utilizing uniform rarefied values. + + Applies absolute read-filtering limits, performs non-replacement subsampling, + and scales community metrics via Bray-Curtis and Jaccard distance calculators. + + Args: + df: Input DataFrame where indices represent samples and columns indicate taxa. + output_path: Target directory Path where results are exported. + rarefaction_depth: Integer specifying the strict depth threshold for subsampling. + seed: Seed for the random generator, used to make rarefaction deterministic. + + Raises: + ValueError: Triggered if fewer than two samples meet the minimum rarefaction depth. 
+ """ out_path: Path = ensure_output_dir(output_path, is_file=False) + rng: np.random.Generator = np.random.default_rng(seed) rarefied_counts: list[np.ndarray] = [] sample_ids: list[str] = [] + # Keep only samples with at least rarefaction_depth reads and rarefy them to that depth for sample, row in df.iterrows(): - counts = np.round(row.values).astype(int) + counts: np.ndarray = np.round(row.values).astype(int) if counts.sum() >= rarefaction_depth: - rarefied = _subsample_counts(counts, n=rarefaction_depth, rng=rng) + rarefied: np.ndarray = _subsample_counts( + counts, n=rarefaction_depth, rng=rng + ) rarefied_counts.append(rarefied) sample_ids.append(str(sample)) if len(rarefied_counts) < 2: raise ValueError("Not enough samples passed the rarefaction threshold.") - X = np.array(rarefied_counts, dtype=float) - idx = pd.Index(sample_ids) + matrix_x: np.ndarray = np.array(rarefied_counts, dtype=float) + index_labels: pd.Index = pd.Index(sample_ids) - bray_df = pd.DataFrame( - squareform(pdist(X, metric="braycurtis")), - index=idx, - columns=idx, + # Pairwise Bray-Curtis (abundance) and Jaccard (presence/absence) distance matrices + bray_df: pd.DataFrame = pd.DataFrame( + squareform(pdist(matrix_x, metric="braycurtis")), + index=index_labels, + columns=index_labels, ) - jaccard_df = pd.DataFrame( - squareform(pdist(X.astype(bool).astype(float), metric="jaccard")), - index=idx, - columns=idx, + jaccard_df: pd.DataFrame = pd.DataFrame( + squareform(pdist(matrix_x.astype(bool).astype(float), metric="jaccard")), + index=index_labels, + columns=index_labels, ) bray_df.to_csv(out_path / "beta_div_bray.csv") jaccard_df.to_csv(out_path / "beta_div_jaccard.csv") _log.info( - f"β-diversity has been successfully calculated and saved to '{output_path}'.
+ "β-diversity has been successfully calculated and saved to '%s'.", output_path ) @@ -161,7 +249,7 @@ def main( ) raise typer.Exit(code=1) - seed_label = ( + seed_label: str = ( str(seed) if seed is not None else "not set (results will vary between runs)" ) _log.info("Rarefaction depth: %d | seed: %s", depth, seed_label) @@ -171,7 +259,7 @@ def main( raise typer.Exit(code=1) output_dir.mkdir(parents=True, exist_ok=True) - df = pd.read_csv(input_file, index_col=0) + df: pd.DataFrame = pd.read_csv(input_file, index_col=0) try: calc_alpha_div(df, output_dir) From 1116b048843f255f2629ac5130a3e1f6e84d81f8 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:48:55 +0200 Subject: [PATCH 16/25] refactor(stats): implement vectorization, explicit type hints, and docstrings in relabund.py --- krakenparser/stats/relabund.py | 86 +++++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 28 deletions(-) diff --git a/krakenparser/stats/relabund.py b/krakenparser/stats/relabund.py index 9439f44..60e945c 100644 --- a/krakenparser/stats/relabund.py +++ b/krakenparser/stats/relabund.py @@ -1,19 +1,29 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 +"""Normalization module for calculating relative abundances of microbial taxa. + +This module reshapes wide count matrices into tidy long-format tables, converts +raw read counts into percentage distributions per sample, filters out zero-abundance +observations, and optionally aggregates rare background taxa under a unified +customizable threshold to prevent downstream overplotting. 
+""" import logging import sys import warnings from pathlib import Path -from typing import Optional +from typing import Any, Optional +import numpy as np import pandas as pd import typer from krakenparser.utils import ensure_output_dir -_log = logging.getLogger(__name__) +# Module-level logger +_log: logging.Logger = logging.getLogger(__name__) -app = typer.Typer( +# Typer sub-application for the relabund command +app: typer.Typer = typer.Typer( name="relabund", add_completion=False, context_settings={"help_option_names": ["-h", "--help"]}, @@ -23,22 +33,42 @@ def calculate_rel_abund( input_file: Path, output_file: Path, other_threshold: Optional[float] = None ) -> None: - in_path = Path(input_file) - if not in_path.is_file(): - raise FileNotFoundError(f"Input file not found: {in_path}") - out_path = ensure_output_dir(str(output_file), is_file=True) + """Transform absolute taxonomic counts to relative percentage profiles. + + Reshapes data into long format, detects and warns about zero-abundance samples, + normalizes counts to a 100% scale, and optionally relabels taxa below a + percentage threshold as a single 'Other (<threshold%)' taxon per sample. + + Args: + input_file: Path to the incoming wide matrix CSV (a Sample_id column is required). + output_file: Target path where the final long-format normalized CSV is saved. + other_threshold: Optional percentage bound (e.g., 3.5). Taxa falling below + this value within a sample are summed into that sample's 'Other' taxon. - # Load counts table - df = pd.read_csv(in_path) + Raises: + FileNotFoundError: Triggered if the specified input count resource is missing.
+ """ if not input_file.is_file(): + raise FileNotFoundError(f"Input file not found: {input_file}") - # Reshape to long format: Sample_id, taxon, abundance - long_df = df.melt(id_vars=["Sample_id"], var_name="taxon", value_name="abundance") + out_path: Path = ensure_output_dir(output_file, is_file=True) - # Summarize total abundance per sample (used for percentage calculation) - total_abundance = long_df.groupby("Sample_id")["abundance"].transform("sum") + # Load counts table matrix (Wide format: rows=samples, cols=taxa) + df: pd.DataFrame = pd.read_csv(input_file) + + # Reshape to long format: Sample_id, taxon, abundance (Tidy data specification) + long_df: pd.DataFrame = df.melt( + id_vars=["Sample_id"], var_name="taxon", value_name="abundance" + ) - zero_samples = long_df.groupby("Sample_id")["abundance"].sum() - zero_samples = zero_samples[zero_samples == 0].index.tolist() + # Total abundance per sample, aligned to every row (used for percentage calculation) + total_abundance: pd.Series = long_df.groupby("Sample_id")["abundance"].transform( + "sum" + ) + + # Find samples whose counts sum to zero so that a warning can be issued + sample_sums: pd.Series = long_df.groupby("Sample_id")["abundance"].sum() + zero_samples: list[Any] = sample_sums[sample_sums == 0].index.tolist() if zero_samples: warnings.warn( f"Samples with zero total abundance were excluded from output: {zero_samples}", @@ -46,29 +76,29 @@ def calculate_rel_abund( stacklevel=2, ) - # Calculate relative abundance (%) + # Calculate relative abundance (%) long_df["rel_abund_perc"] = (long_df["abundance"] / total_abundance) * 100 - # Drop 0.0 rows + # Drop rows with zero relative abundance (NaN rows from zero-total samples fail the test too) long_df = long_df[long_df["rel_abund_perc"] > 0.0] - # Apply "Other" grouping if threshold is specified + # Apply "Other" grouping if threshold is specified if other_threshold is not None: - threshold = float(other_threshold) - label = f"Other (<{threshold}%)" -
long_df["taxon"] = long_df.apply( - lambda row: label if row["rel_abund_perc"] < threshold else row["taxon"], - axis=1, - ) + threshold: float = float(other_threshold) + label: str = f"Other (<{threshold}%)" + + # High-performance Vectorized assignment replacing legacy row-wise df.apply loop + threshold_mask: pd.Series = long_df["rel_abund_perc"] < threshold + long_df["taxon"] = np.where(threshold_mask, label, long_df["taxon"]) - # Summarize final percentages - result = ( + # Aggregate final percentage statistics collapsed under composite groups if applied + result: pd.DataFrame = ( long_df.groupby(["Sample_id", "taxon"], as_index=False)["rel_abund_perc"] .sum() .sort_values(["Sample_id", "rel_abund_perc"], ascending=[True, False]) ) - # Save to CSV + # Flush metrics out to structured storage layout result.to_csv(out_path, index=False) _log.info("Relative abundance saved as '%s'.", output_file) From a1a29e42180589efa13f5acb57e5026002f1ebce Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:50:44 +0200 Subject: [PATCH 17/25] refactor(tests): document fixture architecture and add mock configurations to conftest.py --- tests/conftest.py | 52 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index f681d4f..e494811 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,9 +1,26 @@ +"""Shared pytest fixtures and sample data for the krakenparser test suite. 
+ +Fixture hierarchy +----------------- +conftest.py ← file objects, DataFrames, CLI runner (all tests) +test_units.py ← pure-function tests, no I/O +test_cli.py ← Typer CliRunner smoke / error-path tests +test_integration.py ← library-function I/O + reproducibility tests +test_kpplot.py ← plotting smoke + parameter-validation tests +test_full_pipeline.py← end-to-end pipeline tests (requires demo_data.zip) +""" + import matplotlib matplotlib.use("Agg") import pandas as pd import pytest +from typer.testing import CliRunner + +# --------------------------------------------------------------------------- +# Raw sample data — module-level constants shared across test files +# --------------------------------------------------------------------------- SAMPLE_KREPORT = ( "99.98\t999980\t0\tR\t1\troot\n" @@ -26,9 +43,19 @@ "Bacteroides fragilis\t100000\t200000\n" ) +# MPA-format single-sample files used across CLI and integration tests +SAMPLE_MPA_A = "#Classification\tsample1\nd__Bacteria|s__Pseudomonas_aeruginosa\t300\n" +SAMPLE_MPA_B = "#Classification\tsample2\nd__Bacteria|s__Pseudomonas_aeruginosa\t100\n" + + +# --------------------------------------------------------------------------- +# File-based fixtures +# --------------------------------------------------------------------------- + @pytest.fixture def kreport_file(tmp_path): + """A single valid Kraken2 report file covering common ranks.""" f = tmp_path / "sample.kreport" f.write_text(SAMPLE_KREPORT) return f @@ -36,6 +63,7 @@ def kreport_file(tmp_path): @pytest.fixture def counts_txt_file(tmp_path): + """Tab-delimited counts file as produced by processing_script.py.""" f = tmp_path / "counts_species.txt" f.write_text(SAMPLE_COUNTS_TXT) return f @@ -43,6 +71,7 @@ def counts_txt_file(tmp_path): @pytest.fixture def counts_csv_file(tmp_path): + """Wide-format CSV with Sample_id index column and per-taxon count columns.""" df = pd.DataFrame( { "Sample_id": ["S1", "S2"], @@ -56,8 +85,14 @@ def 
counts_csv_file(tmp_path): return f +# --------------------------------------------------------------------------- +# DataFrame fixtures +# --------------------------------------------------------------------------- + + @pytest.fixture def relabund_df(): + """Long-format relative-abundance DataFrame with two samples and three taxa.""" return pd.DataFrame( { "Sample_id": ["S1", "S1", "S1", "S2", "S2", "S2"], @@ -72,3 +107,20 @@ def relabund_df(): "rel_abund_perc": [70.0, 20.0, 10.0, 50.0, 35.0, 15.0], } ) + + +@pytest.fixture +def two_sample_metadata(): + """Minimal metadata DataFrame mapping S1→TypeA and S2→TypeB.""" + return pd.DataFrame({"Sample_id": ["S1", "S2"], "Group": ["TypeA", "TypeB"]}) + + +# --------------------------------------------------------------------------- +# CLI runner +# --------------------------------------------------------------------------- + + +@pytest.fixture +def runner(): + """Typer CliRunner instance, shared by all CLI smoke tests.""" + return CliRunner() From a639f503be693553f23c99434fb665fe7d014635 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:51:57 +0200 Subject: [PATCH 18/25] refactor(tests): parameterize CLI error paths and clean up entrypoint testing in test_cli.py --- tests/test_cli.py | 230 ++++++++++++++++++---------------------------- 1 file changed, 89 insertions(+), 141 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index f3def0e..687207e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,10 +1,19 @@ -"""Smoke tests for CLI entry-points via Typer CliRunner.""" +"""Smoke tests for CLI entry-points via Typer CliRunner. + +Each command is exercised for three standard error paths: + 1. No arguments → usage help (exit 0). + 2. Missing one required option → validation error (exit 1). + 3. Non-existent input file → runtime error (exit 1). + +Happy paths for commands that produce file output are covered separately in +test_integration.py and test_full_pipeline.py. 
+""" import shutil import pandas as pd import pytest -from typer.testing import CliRunner +from conftest import SAMPLE_MPA_A, SAMPLE_MPA_B from krakenparser.counts.convert2csv import app as convert2csv_app from krakenparser.counts.processing_script import app as processing_app @@ -15,9 +24,6 @@ from krakenparser.stats.diversity import app as diversity_app from krakenparser.stats.relabund import app as relabund_app -_MPA_A = "#Classification\tsample1\nd__Bacteria|s__Pseudomonas_aeruginosa\t300\n" -_MPA_B = "#Classification\tsample2\nd__Bacteria|s__Pseudomonas_aeruginosa\t100\n" - _COMBINED_MPA = ( "#Classification\tsample1\tsample2\n" "d__Bacteria|p__Pseudomonadota|g__Pseudomonas|s__Pseudomonas_aeruginosa\t300\t100\n" @@ -25,118 +31,102 @@ ) -@pytest.fixture -def runner(): - return CliRunner() - - # --------------------------------------------------------------------------- -# convert2csv +# Parametrized standard error-path suite # --------------------------------------------------------------------------- - -def test_convert2csv_no_args_shows_help(runner): - result = runner.invoke(convert2csv_app, []) - assert result.exit_code == 0 - assert "Usage" in result.output - - -def test_convert2csv_missing_one_option(runner, tmp_path): - result = runner.invoke(convert2csv_app, ["-i", str(tmp_path / "x.txt")]) - assert result.exit_code == 1 - assert "Missing required options" in result.output - - -def test_convert2csv_file_not_found(runner, tmp_path): - result = runner.invoke( +# Each tuple: (app, partial-args list, file-not-found args list). +# Placeholder tokens such as "{{ghost}}" are resolved to tmp_path paths by _resolve().
+_CLI_SPECS = [ + ( convert2csv_app, - ["-i", str(tmp_path / "ghost.txt"), "-o", str(tmp_path / "out.csv")], - ) - assert result.exit_code == 1 - assert "Error" in result.output - - -# --------------------------------------------------------------------------- -# processing_script -# --------------------------------------------------------------------------- - - -def test_processing_no_args_shows_help(runner): - result = runner.invoke(processing_app, []) - assert result.exit_code == 0 - assert "Usage" in result.output - - -def test_processing_missing_one_option(runner, tmp_path): - result = runner.invoke(processing_app, ["-i", str(tmp_path / "x.txt")]) - assert result.exit_code == 1 - assert "Missing required options" in result.output - - -def test_processing_file_not_found(runner, tmp_path): - result = runner.invoke( + ["-i", "{{ghost}}"], + ["-i", "{{ghost}}", "-o", "{{out}}"], + ), + ( processing_app, - ["-i", str(tmp_path / "ghost.txt"), "-o", str(tmp_path / "dest.txt")], - ) - assert result.exit_code == 1 - assert "Error" in result.output + ["-i", "{{ghost}}"], + ["-i", "{{ghost}}", "-o", "{{dest}}"], + ), + ( + split_mpa_app, + ["-i", "{{ghost}}"], + ["-i", "{{ghost}}", "-o", "{{out}}"], + ), + ( + diversity_app, + ["-o", "{{out}}"], + ["-i", "{{ghost}}", "-o", "{{out}}"], + ), + ( + relabund_app, + ["-i", "{{ghost}}"], + ["-i", "{{ghost}}", "-o", "{{out}}"], + ), +] -# --------------------------------------------------------------------------- -# split_mpa -# --------------------------------------------------------------------------- +def _resolve(args: list[str], tmp_path) -> list[str]: + """Substitute placeholder tokens with concrete tmp_path paths.""" + mapping = { + "{{ghost}}": str(tmp_path / "ghost.txt"), + "{{out}}": str(tmp_path / "out.csv"), + "{{dest}}": str(tmp_path / "dest.txt"), + } + return [mapping.get(a, a) for a in args] -def test_split_mpa_no_args_shows_help(runner): - result = runner.invoke(split_mpa_app, []) 
+@pytest.mark.parametrize("app,_,__", _CLI_SPECS) +def test_no_args_shows_help(app, _, __, runner): + result = runner.invoke(app, []) assert result.exit_code == 0 assert "Usage" in result.output -def test_split_mpa_missing_one_option(runner, tmp_path): - result = runner.invoke(split_mpa_app, ["-i", str(tmp_path / "x.txt")]) +@pytest.mark.parametrize("app,partial_args,__", _CLI_SPECS) +def test_missing_required_option(app, partial_args, __, runner, tmp_path): + result = runner.invoke(app, _resolve(partial_args, tmp_path)) assert result.exit_code == 1 assert "Missing required options" in result.output -def test_split_mpa_file_not_found(runner, tmp_path): - result = runner.invoke( - split_mpa_app, - ["-i", str(tmp_path / "ghost.txt"), "-o", str(tmp_path / "out")], - ) +@pytest.mark.parametrize("app,_,fnf_args", _CLI_SPECS) +def test_file_not_found(app, _, fnf_args, runner, tmp_path): + result = runner.invoke(app, _resolve(fnf_args, tmp_path)) assert result.exit_code == 1 assert "Error" in result.output # --------------------------------------------------------------------------- -# mpa_table +# mpa_table — happy path (two valid input files → combined output) # --------------------------------------------------------------------------- -def test_mpa_table_main(tmp_path, runner): +def test_mpa_table_combines_two_files(runner, tmp_path): a, b = tmp_path / "a.MPA.TXT", tmp_path / "b.MPA.TXT" - a.write_text(_MPA_A) - b.write_text(_MPA_B) + a.write_text(SAMPLE_MPA_A) + b.write_text(SAMPLE_MPA_B) out = tmp_path / "COMBINED.txt" result = runner.invoke(mpa_table_app, ["-i", str(a), "-i", str(b), "-o", str(out)]) + assert result.exit_code == 0 assert out.exists() # --------------------------------------------------------------------------- -# transform2mpa +# transform2mpa — single file and batch directory modes # --------------------------------------------------------------------------- -def test_transform2mpa_main_single(kreport_file, tmp_path, runner): +def 
test_transform2mpa_single_file(kreport_file, runner, tmp_path): out = tmp_path / "out.MPA.TXT" result = runner.invoke(transform2mpa_app, ["-r", str(kreport_file), "-o", str(out)]) assert result.exit_code == 0 assert out.exists() -def test_transform2mpa_main_batch(kreport_file, tmp_path, runner): +def test_transform2mpa_batch_directory(kreport_file, runner, tmp_path): kreports_dir = tmp_path / "kreports" kreports_dir.mkdir() shutil.copy(kreport_file, kreports_dir / kreport_file.name) @@ -145,100 +135,58 @@ def test_transform2mpa_main_batch(kreport_file, tmp_path, runner): result = runner.invoke( transform2mpa_app, ["-i", str(kreports_dir), "-o", str(out_dir)] ) + assert result.exit_code == 0 assert out_dir.is_dir() # --------------------------------------------------------------------------- -# diversity +# pipeline CLI — error paths (success path covered by test_full_pipeline.py) # --------------------------------------------------------------------------- -def test_diversity_no_args_shows_help(runner): - result = runner.invoke(diversity_app, []) - assert result.exit_code == 0 - assert "Usage" in result.output - - -def test_diversity_missing_one_option(runner, tmp_path): - result = runner.invoke(diversity_app, ["-o", str(tmp_path / "out")]) - assert result.exit_code == 1 - assert "Missing required options" in result.output - - -def test_diversity_file_not_found(runner, tmp_path): - result = runner.invoke( - diversity_app, - ["-i", str(tmp_path / "ghost.csv"), "-o", str(tmp_path / "out")], - ) +def test_pipeline_empty_kreports_dir_raises(runner, tmp_path): + empty_dir = tmp_path / "kreports" + empty_dir.mkdir() + result = runner.invoke(pipeline_app, ["-i", str(empty_dir)]) assert result.exit_code == 1 assert "Error" in result.output -def test_diversity_not_enough_samples_for_beta(runner, tmp_path): - csv_in = tmp_path / "single.csv" - pd.DataFrame({"Taxon_A": [100], "Taxon_B": [200]}, index=["S1"]).to_csv(csv_in) - out_dir = tmp_path / "div" - result = 
runner.invoke( - diversity_app, - ["-i", str(csv_in), "-o", str(out_dir), "-d", "50"], - ) +def test_pipeline_missing_input_dir_raises(runner, tmp_path): + result = runner.invoke(pipeline_app, ["-i", str(tmp_path / "ghost")]) assert result.exit_code == 1 assert "Error" in result.output -# --------------------------------------------------------------------------- -# relabund -# --------------------------------------------------------------------------- - - -def test_relabund_no_args_shows_help(runner): - result = runner.invoke(relabund_app, []) - assert result.exit_code == 0 - assert "Usage" in result.output +def test_pipeline_existing_output_without_overwrite_raises( + runner, tmp_path, kreport_file +): + kreports_dir = tmp_path / "kreports" + kreports_dir.mkdir() + shutil.copy(kreport_file, kreports_dir / kreport_file.name) + runner.invoke(pipeline_app, ["-i", str(kreports_dir), "--overwrite"]) + result = runner.invoke(pipeline_app, ["-i", str(kreports_dir)]) -def test_relabund_missing_one_option(runner, tmp_path): - result = runner.invoke(relabund_app, ["-i", str(tmp_path / "x.csv")]) - assert result.exit_code == 1 - assert "Missing required options" in result.output - - -def test_relabund_file_not_found(runner, tmp_path): - result = runner.invoke( - relabund_app, - ["-i", str(tmp_path / "ghost.csv"), "-o", str(tmp_path / "out.csv")], - ) assert result.exit_code == 1 assert "Error" in result.output # --------------------------------------------------------------------------- -# pipeline (error paths only — success path covered by test_full_pipeline.py) +# diversity — domain-specific validation # --------------------------------------------------------------------------- -def test_pipeline_no_mpa_files(runner, tmp_path): - empty_dir = tmp_path / "kreports" - empty_dir.mkdir() - result = runner.invoke(pipeline_app, ["-i", str(empty_dir)]) - assert result.exit_code == 1 - assert "Error" in result.output - - -def test_pipeline_file_exists_error(runner, tmp_path, 
kreport_file): - kreports_dir = tmp_path / "kreports" - kreports_dir.mkdir() - shutil.copy(kreport_file, kreports_dir / kreport_file.name) - - runner.invoke(pipeline_app, ["-i", str(kreports_dir), "--overwrite"]) - - result = runner.invoke(pipeline_app, ["-i", str(kreports_dir)]) - assert result.exit_code == 1 - assert "Error" in result.output +def test_diversity_not_enough_samples_for_beta(runner, tmp_path): + csv_in = tmp_path / "single.csv" + pd.DataFrame({"Taxon_A": [100], "Taxon_B": [200]}, index=["S1"]).to_csv(csv_in) # ty:ignore[invalid-argument-type] + out_dir = tmp_path / "div" + result = runner.invoke( + diversity_app, + ["-i", str(csv_in), "-o", str(out_dir), "-d", "50"], + ) -def test_pipeline_missing_input_dir(runner, tmp_path): - result = runner.invoke(pipeline_app, ["-i", str(tmp_path / "ghost")]) assert result.exit_code == 1 assert "Error" in result.output From 4e42422e5c4823ce2127e81020eecf64a148f0d9 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:52:48 +0200 Subject: [PATCH 19/25] refactor(tests): clarify end-to-end execution scope and clean up pipeline tests in test_full_pipeline.py --- tests/test_full_pipeline.py | 63 ++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 22 deletions(-) diff --git a/tests/test_full_pipeline.py b/tests/test_full_pipeline.py index ad2dba6..07fed16 100644 --- a/tests/test_full_pipeline.py +++ b/tests/test_full_pipeline.py @@ -1,3 +1,14 @@ +"""End-to-end pipeline tests using real demo data. + +These tests exercise the full ``run_pipeline`` call stack — kreport → MPA → +counts CSV → relative abundance → diversity — and are skipped automatically +when ``demo_data.zip`` is absent from the repository root. + +Execution time is dominated by I/O and rarefaction; they are intentionally +kept out of the default fast-test run and should be executed in CI via a +dedicated ``pytest -m integration`` marker (or equivalent). 
+""" + import shutil import zipfile from pathlib import Path @@ -6,13 +17,22 @@ from krakenparser.pipeline import run_pipeline +# --------------------------------------------------------------------------- +# Fixture +# --------------------------------------------------------------------------- + @pytest.fixture def demo_run(tmp_path): + """Unpack demo_data.zip into a fresh tmp directory and return the run dir. + + Skips the test session if the zip is not present so that the suite can + still pass in environments that only have unit/integration data. + """ repo_root = Path(__file__).parent.parent.resolve() zip_src = repo_root / "demo_data.zip" if not zip_src.exists(): - pytest.skip("demo_data.zip not found") + pytest.skip("demo_data.zip not found — skipping end-to-end tests") local_zip = tmp_path / "demo_data.zip" shutil.copy(zip_src, local_zip) @@ -27,49 +47,48 @@ def demo_run(tmp_path): return {"run_dir": run_dir} -def test_full_pipeline_end_to_end(demo_run): +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +def test_full_pipeline_produces_all_expected_outputs(demo_run): run_dir = demo_run["run_dir"] kreports_path = run_dir / "kreports" run_pipeline(kreports_path) - # Assert each rank-level CSV exists and is non-empty - ranks = ["phylum", "class", "order", "family", "genus", "species"] - for rank in ranks: + # Per-rank count CSVs + for rank in ("phylum", "class", "order", "family", "genus", "species"): csv_path = run_dir / "counts" / f"counts_{rank}.csv" assert csv_path.exists(), f"Missing counts_{rank}.csv" assert csv_path.stat().st_size > 0, f"counts_{rank}.csv is empty" - # Assert relative-abundance CSVs exist and are non-empty + # Relative-abundance outputs rel_dir = run_dir / "rel_abund" assert rel_dir.exists(), "rel_abund directory is missing" - rel_species = rel_dir / "ra_species.csv" - assert rel_species.exists(), "Missing 
ra_species.csv" - assert rel_species.stat().st_size > 0, "ra_species.csv is empty" + ra_species = rel_dir / "ra_species.csv" + assert ra_species.exists(), "Missing ra_species.csv" + assert ra_species.stat().st_size > 0, "ra_species.csv is empty" - # Assert diversity outputs exist - diversity_dir = run_dir / "diversity" - assert (diversity_dir / "alpha_div.csv").exists() + # Diversity outputs + assert (run_dir / "diversity" / "alpha_div.csv").exists() - # Assert intermediate files exist + # Intermediate combined MPA assert (run_dir / "intermediate" / "COMBINED.txt").exists() -def test_pipeline_overwrite_protection(demo_run): - run_dir = demo_run["run_dir"] - kreports_path = run_dir / "kreports" +def test_pipeline_overwrite_protection_raises_on_second_run(demo_run): + kreports_path = demo_run["run_dir"] / "kreports" run_pipeline(kreports_path) - # Second run without --overwrite must raise (library function, not sys.exit) with pytest.raises(FileExistsError): run_pipeline(kreports_path) -def test_pipeline_overwrite_flag(demo_run): - run_dir = demo_run["run_dir"] - kreports_path = run_dir / "kreports" +def test_pipeline_overwrite_flag_allows_second_run(demo_run): + kreports_path = demo_run["run_dir"] / "kreports" run_pipeline(kreports_path) - # Second run with overwrite=True must succeed - run_pipeline(kreports_path, overwrite=True) + run_pipeline(kreports_path, overwrite=True) # must not raise From 1a011404f8113aeea48331cc834b4fad8f82bb27 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:55:58 +0200 Subject: [PATCH 20/25] refactor(tests): structure integration endpoints and contract checks in test_integration.py --- tests/test_integration.py | 256 +++++++++++++++++++++----------------- 1 file changed, 145 insertions(+), 111 deletions(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index 4dc9511..e60a783 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,11 +1,22 @@ -"""Characterization / integration tests 
— file I/O, reproducibility.""" +"""Characterization and integration tests — file I/O, output contracts, reproducibility. + +Each section follows the same structure: + 1. Reproducibility — same input always produces bit-identical output. + 2. Output contract — schema, shape, and invariants of the output data. + 3. Error handling — FileNotFoundError and domain-specific ValueError cases. + +I/O helpers and file fixtures are defined in conftest.py. +Pure-function math is tested separately in test_units.py. +""" import hashlib import itertools from pathlib import Path +import numpy as np import pandas as pd import pytest +from conftest import SAMPLE_MPA_A, SAMPLE_MPA_B from krakenparser.counts.convert2csv import convert_to_csv from krakenparser.counts.processing_script import process_files @@ -16,7 +27,11 @@ from krakenparser.stats.diversity import calc_alpha_div, calc_beta_div from krakenparser.stats.relabund import calculate_rel_abund -SAMPLE_COMBINED_MPA = ( +# --------------------------------------------------------------------------- +# Multi-rank MPA fixture used only in this module +# --------------------------------------------------------------------------- + +_COMBINED_MPA_TEXT = ( "#Classification\tsample1\tsample2\n" "d__Bacteria|p__Pseudomonadota|g__Pseudomonas|s__Pseudomonas_aeruginosa\t300\t100\n" "d__Bacteria|p__Pseudomonadota|g__Pseudomonas\t500\t200\n" @@ -30,13 +45,14 @@ @pytest.fixture def combined_mpa_file(tmp_path): + """Combined MPA file spanning multiple ranks, domains, and a human taxon.""" f = tmp_path / "COMBINED.txt" - f.write_text(SAMPLE_COMBINED_MPA) + f.write_text(_COMBINED_MPA_TEXT) return f # --------------------------------------------------------------------------- -# Helpers +# Helper # --------------------------------------------------------------------------- @@ -44,9 +60,9 @@ def _sha256(path: Path) -> str: return hashlib.sha256(path.read_bytes()).hexdigest() -# 
--------------------------------------------------------------------------- +# =========================================================================== # kreport_to_mpa -# --------------------------------------------------------------------------- +# =========================================================================== def test_kreport_to_mpa_reproducible(kreport_file, tmp_path): @@ -60,11 +76,10 @@ def run(): assert run() == run() -def test_kreport_to_mpa_standard_ranks_only(kreport_file, tmp_path): +def test_kreport_to_mpa_standard_ranks_are_present(kreport_file, tmp_path): out = tmp_path / "out.MPA.TXT" kreport_to_mpa(kreport_file, out) - lines = out.read_text().splitlines() - paths = [ln.split("\t")[0] for ln in lines] + paths = [ln.split("\t")[0] for ln in out.read_text().splitlines()] assert any("p__Pseudomonadota" in p for p in paths) assert any("s__Pseudomonas_aeruginosa" in p for p in paths) @@ -75,14 +90,16 @@ def test_kreport_to_mpa_excludes_unclassified_and_root(kreport_file, tmp_path): out = tmp_path / "out.MPA.TXT" kreport_to_mpa(kreport_file, out) content = out.read_text() + assert "unclassified" not in content assert "root" not in content -def test_kreport_to_mpa_display_header(kreport_file, tmp_path): +def test_kreport_to_mpa_display_header_includes_filename(kreport_file, tmp_path): out = tmp_path / "out.MPA.TXT" kreport_to_mpa(kreport_file, out, display_header=True) first_line = out.read_text().splitlines()[0] + assert first_line.startswith("#Classification") assert kreport_file.name in first_line @@ -90,17 +107,27 @@ def test_kreport_to_mpa_display_header(kreport_file, tmp_path): def test_kreport_to_mpa_paths_are_hierarchical(kreport_file, tmp_path): out = tmp_path / "out.MPA.TXT" kreport_to_mpa(kreport_file, out) - lines = out.read_text().splitlines() - for ln in lines: + + for ln in out.read_text().splitlines(): path = ln.split("\t")[0] - segments = path.split("|") - for seg in segments: + for seg in path.split("|"): assert "__" in 
seg, f"Unexpected MPA segment format: {seg!r}" -# --------------------------------------------------------------------------- +def test_kreport_to_mpa_creates_output_dir(kreport_file, tmp_path): + out = tmp_path / "new_subdir" / "out.MPA.TXT" + kreport_to_mpa(kreport_file, out) + assert out.exists() + + +def test_kreport_to_mpa_missing_input_raises(tmp_path): + with pytest.raises(FileNotFoundError): + kreport_to_mpa(tmp_path / "ghost.kreport", tmp_path / "out.MPA.TXT") + + +# =========================================================================== # convert_to_csv -# --------------------------------------------------------------------------- +# =========================================================================== def test_convert_to_csv_reproducible(counts_txt_file, tmp_path): @@ -124,9 +151,20 @@ def test_convert_to_csv_transposes_correctly(counts_txt_file, tmp_path): assert "Pseudomonas aeruginosa" in df.columns -# --------------------------------------------------------------------------- +def test_convert_to_csv_creates_output_dir(counts_txt_file, tmp_path): + out = tmp_path / "new_subdir" / "counts.csv" + convert_to_csv(counts_txt_file, out) + assert out.exists() + + +def test_convert_to_csv_missing_input_raises(tmp_path): + with pytest.raises(FileNotFoundError): + convert_to_csv(tmp_path / "ghost.txt", tmp_path / "out.csv") + + +# =========================================================================== # process_files -# --------------------------------------------------------------------------- +# =========================================================================== def test_process_files_adds_header_and_cleans_names(tmp_path): @@ -142,6 +180,7 @@ def test_process_files_adds_header_and_cleans_names(tmp_path): process_files(source, dest) result = dest.read_text() lines = result.splitlines() + assert lines[0] == "#Classification\tsample1\tsample2" assert "Pseudomonas aeruginosa" in result assert "Escherichia coli" in result @@ -155,6 +194,7 
@@ def test_process_files_reproducible(tmp_path): dest = tmp_path / f"counts_{i}.txt" dest.write_text("s__Some_species\t10\n") process_files(source, dest) + assert (tmp_path / "counts_0.txt").read_text() == ( tmp_path / "counts_1.txt" ).read_text() @@ -174,14 +214,9 @@ def test_process_files_missing_dest_raises(tmp_path): process_files(source, tmp_path / "ghost.txt") -def test_convert_to_csv_missing_input_raises(tmp_path): - with pytest.raises(FileNotFoundError): - convert_to_csv(tmp_path / "ghost.txt", tmp_path / "out.csv") - - -# --------------------------------------------------------------------------- +# =========================================================================== # calculate_rel_abund -# --------------------------------------------------------------------------- +# =========================================================================== def test_relabund_reproducible(counts_csv_file, tmp_path): @@ -199,6 +234,7 @@ def test_relabund_sums_to_100_per_sample(counts_csv_file, tmp_path): out = tmp_path / "ra.csv" calculate_rel_abund(counts_csv_file, out) df = pd.read_csv(out) + for sample, grp in df.groupby("Sample_id"): total = grp["rel_abund_perc"].sum() assert total == pytest.approx(100.0, abs=1e-6), f"{sample}: sum={total}" @@ -211,21 +247,27 @@ def test_relabund_other_threshold_creates_other_group(counts_csv_file, tmp_path) assert df["taxon"].str.startswith("Other").any() -def test_relabund_no_zero_rows(counts_csv_file, tmp_path): +def test_relabund_no_zero_abundance_rows(counts_csv_file, tmp_path): out = tmp_path / "ra.csv" calculate_rel_abund(counts_csv_file, out) df = pd.read_csv(out) assert (df["rel_abund_perc"] > 0).all() +def test_relabund_creates_output_dir(counts_csv_file, tmp_path): + out = tmp_path / "new_subdir" / "ra.csv" + calculate_rel_abund(counts_csv_file, out) + assert out.exists() + + def test_relabund_missing_input_raises(tmp_path): with pytest.raises(FileNotFoundError): calculate_rel_abund(tmp_path / "ghost.csv", tmp_path 
/ "out.csv") -# --------------------------------------------------------------------------- +# =========================================================================== # calc_alpha_div -# --------------------------------------------------------------------------- +# =========================================================================== def test_alpha_div_reproducible(counts_csv_file, tmp_path): @@ -247,11 +289,12 @@ def test_alpha_div_output_columns(counts_csv_file, tmp_path): out_dir.mkdir() calc_alpha_div(df, out_dir) result = pd.read_csv(out_dir / "alpha_div.csv") + assert set(result.columns) == {"Sample", "Shannon", "Pielou", "Chao1"} assert len(result) == len(df) -def test_alpha_div_shannon_non_negative(counts_csv_file, tmp_path): +def test_alpha_div_shannon_is_non_negative(counts_csv_file, tmp_path): df = pd.read_csv(counts_csv_file, index_col=0) out_dir = tmp_path / "diversity" out_dir.mkdir() @@ -260,9 +303,16 @@ def test_alpha_div_shannon_non_negative(counts_csv_file, tmp_path): assert (result["Shannon"] >= 0).all() -# --------------------------------------------------------------------------- +def test_alpha_div_creates_output_dir(counts_csv_file, tmp_path): + df = pd.read_csv(counts_csv_file, index_col=0) + out_dir = tmp_path / "new_dir" / "nested" + calc_alpha_div(df, out_dir) + assert (out_dir / "alpha_div.csv").exists() + + +# =========================================================================== # calc_beta_div -# --------------------------------------------------------------------------- +# =========================================================================== def test_beta_div_output_files_exist(counts_csv_file, tmp_path): @@ -270,6 +320,7 @@ def test_beta_div_output_files_exist(counts_csv_file, tmp_path): out_dir = tmp_path / "diversity" out_dir.mkdir() calc_beta_div(df, out_dir, rarefaction_depth=1000) + assert (out_dir / "beta_div_bray.csv").exists() assert (out_dir / "beta_div_jaccard.csv").exists() @@ -280,6 +331,7 @@ def 
test_beta_div_matrix_is_square(counts_csv_file, tmp_path): out_dir.mkdir() calc_beta_div(df, out_dir, rarefaction_depth=1000) bray = pd.read_csv(out_dir / "beta_div_bray.csv", index_col=0) + assert bray.shape[0] == bray.shape[1] @@ -289,22 +341,28 @@ def test_beta_div_diagonal_is_zero(counts_csv_file, tmp_path): out_dir.mkdir() calc_beta_div(df, out_dir, rarefaction_depth=1000) bray = pd.read_csv(out_dir / "beta_div_bray.csv", index_col=0) - import numpy as np assert np.allclose(np.diag(bray.values), 0.0) def test_beta_div_too_few_samples_raises(tmp_path): - df = pd.DataFrame({"Taxon_A": [100], "Taxon_B": [200]}, index=["S1"]) + df = pd.DataFrame({"Taxon_A": [100], "Taxon_B": [200]}, index=["S1"]) # ty:ignore[invalid-argument-type] out_dir = tmp_path / "diversity" out_dir.mkdir() with pytest.raises(ValueError, match="rarefaction"): calc_beta_div(df, out_dir, rarefaction_depth=1000) -# --------------------------------------------------------------------------- +def test_beta_div_creates_output_dir(counts_csv_file, tmp_path): + df = pd.read_csv(counts_csv_file, index_col=0) + out_dir = tmp_path / "new_dir" / "nested" + calc_beta_div(df, out_dir, rarefaction_depth=1000, seed=42) + assert (out_dir / "beta_div_bray.csv").exists() + + +# =========================================================================== # split_mpa -# --------------------------------------------------------------------------- +# =========================================================================== def test_split_mpa_creates_all_rank_files(combined_mpa_file, tmp_path): @@ -327,8 +385,9 @@ def run(): def test_split_mpa_filters_human_by_default(combined_mpa_file, tmp_path): split_mpa(combined_mpa_file, tmp_path) species = (tmp_path / "txt" / "counts_species.txt").read_text() - assert "Homo_sapiens" not in species genus = (tmp_path / "txt" / "counts_genus.txt").read_text() + + assert "Homo_sapiens" not in species assert "g__Homo" not in genus @@ -338,84 +397,75 @@ def 
test_split_mpa_keep_human_retains_homo(combined_mpa_file, tmp_path): assert "Homo_sapiens" in species -def test_split_mpa_viruses_only(combined_mpa_file, tmp_path): +def test_split_mpa_viruses_only_excludes_bacteria(combined_mpa_file, tmp_path): split_mpa(combined_mpa_file, tmp_path, viruses_only=True) species = (tmp_path / "txt" / "counts_species.txt").read_text() + assert "Virus_alpha" in species assert "Pseudomonas_aeruginosa" not in species -def test_split_mpa_strips_path_prefix(combined_mpa_file, tmp_path): +def test_split_mpa_strips_pipe_path_prefix(combined_mpa_file, tmp_path): split_mpa(combined_mpa_file, tmp_path) species = (tmp_path / "txt" / "counts_species.txt").read_text() + assert "|" not in species assert "s__" in species -def test_split_mpa_genus_excludes_species_lines(combined_mpa_file, tmp_path): +def test_split_mpa_genus_file_excludes_species_lines(combined_mpa_file, tmp_path): split_mpa(combined_mpa_file, tmp_path) genus = (tmp_path / "txt" / "counts_genus.txt").read_text() assert "s__" not in genus -def test_split_mpa_missing_input_raises(tmp_path): - with pytest.raises(FileNotFoundError): - split_mpa(tmp_path / "ghost.txt", tmp_path / "out") - - -# --------------------------------------------------------------------------- -# auto-create output directories (ensure_output_dir behaviour) -# --------------------------------------------------------------------------- - - -def test_kreport_to_mpa_creates_output_dir(kreport_file, tmp_path): - out = tmp_path / "new_subdir" / "out.MPA.TXT" - kreport_to_mpa(kreport_file, out) - assert out.exists() +def test_split_mpa_filters_terminal_rank_nodes(tmp_path): + combined = tmp_path / "COMBINED.txt" + combined.write_text( + "#Classification\tsample1\n" + "d__Bacteria|p__Pseudomonadota|s__Pseudomonas_aeruginosa\t300\n" + "d__Bacteria|p__Pseudomonadota|s__Pseudomonas_aeruginosa|t__strain_X\t10\n" + ) + split_mpa(combined, tmp_path / "out") + species = (tmp_path / "out" / "txt" / 
"counts_species.txt").read_text() + assert "t__" not in species -def test_kreport_to_mpa_missing_input_raises(tmp_path): - with pytest.raises(FileNotFoundError): - kreport_to_mpa(tmp_path / "ghost.kreport", tmp_path / "out.MPA.TXT") +def test_split_mpa_domain_filters(tmp_path): + input_mpa = tmp_path / "input_mpa.txt" + input_mpa.write_text( + "#Classification\tsample1\n" + "d__Bacteria|p__Bacillota\t50\n" + "d__Archaea|p__Methanobacteriota\t30\n" + "k__Fungi|p__Ascomycota\t20\n" + ) + out_bact = tmp_path / "out_bact" + split_mpa(input_mpa, out_bact, bacteria_only=True) -def test_convert_to_csv_creates_output_dir(counts_txt_file, tmp_path): - out = tmp_path / "new_subdir" / "counts.csv" - convert_to_csv(counts_txt_file, out) - assert out.exists() + out_fungi = tmp_path / "out_fungi" + split_mpa(input_mpa, out_fungi, fungi_only=True) + out_arch = tmp_path / "out_arch" + split_mpa(input_mpa, out_arch, archaea_only=True) -def test_relabund_creates_output_dir(counts_csv_file, tmp_path): - out = tmp_path / "new_subdir" / "ra.csv" - calculate_rel_abund(counts_csv_file, out) - assert out.exists() + assert out_bact.exists() + assert out_fungi.exists() + assert out_arch.exists() -def test_alpha_div_creates_output_dir(counts_csv_file, tmp_path): - df = pd.read_csv(counts_csv_file, index_col=0) - out_dir = tmp_path / "new_dir" / "nested" - calc_alpha_div(df, out_dir) - assert (out_dir / "alpha_div.csv").exists() - - -def test_beta_div_creates_output_dir(counts_csv_file, tmp_path): - df = pd.read_csv(counts_csv_file, index_col=0) - out_dir = tmp_path / "new_dir" / "nested" - calc_beta_div(df, out_dir, rarefaction_depth=1000, seed=42) - assert (out_dir / "beta_div_bray.csv").exists() - +def test_split_mpa_missing_input_raises(tmp_path): + with pytest.raises(FileNotFoundError): + split_mpa(tmp_path / "ghost.txt", tmp_path / "out") -# --------------------------------------------------------------------------- -# combine_mpa — new input validation -# 
--------------------------------------------------------------------------- -SAMPLE_MPA_A = "#Classification\tsample1\nd__Bacteria|s__Pseudomonas_aeruginosa\t300\n" -SAMPLE_MPA_B = "#Classification\tsample2\nd__Bacteria|s__Pseudomonas_aeruginosa\t100\n" +# =========================================================================== +# combine_mpa +# =========================================================================== def test_combine_mpa_creates_output_dir(tmp_path): - a = tmp_path / "a.MPA.TXT" - b = tmp_path / "b.MPA.TXT" + a, b = tmp_path / "a.MPA.TXT", tmp_path / "b.MPA.TXT" a.write_text(SAMPLE_MPA_A) b.write_text(SAMPLE_MPA_B) out = tmp_path / "new_subdir" / "COMBINED.txt" @@ -430,59 +480,43 @@ def test_combine_mpa_missing_input_raises(tmp_path): combine_mpa([existing, tmp_path / "ghost.MPA.TXT"], tmp_path / "out.txt") -# --------------------------------------------------------------------------- -# process_files — destination must already exist (in-place modifier) -# --------------------------------------------------------------------------- +# =========================================================================== +# process_files — additional destination contract +# =========================================================================== def test_process_files_missing_dest_still_raises(tmp_path): + """process_files is an in-place modifier; the destination must already exist.""" source = tmp_path / "COMBINED.txt" source.write_text("#Classification\tsample1.kreport\n") with pytest.raises(FileNotFoundError): process_files(source, tmp_path / "nonexistent.txt") -# --------------------------------------------------------------------------- -# split_mpa — t__ rank filter (intermediate terminal nodes) -# --------------------------------------------------------------------------- - - -def test_split_mpa_filters_terminal_rank_nodes(tmp_path): - combined = tmp_path / "COMBINED.txt" - combined.write_text( - "#Classification\tsample1\n" - 
"d__Bacteria|p__Pseudomonadota|s__Pseudomonas_aeruginosa\t300\n" - "d__Bacteria|p__Pseudomonadota|s__Pseudomonas_aeruginosa|t__strain_X\t10\n" - ) - split_mpa(combined, tmp_path / "out") - species = (tmp_path / "out" / "txt" / "counts_species.txt").read_text() - assert "t__" not in species - - -# --------------------------------------------------------------------------- -# _is_processable — hidden files, null bytes, non-UTF-8 -# --------------------------------------------------------------------------- +# =========================================================================== +# _is_processable +# =========================================================================== -def test_is_processable_hidden_file(tmp_path): +def test_is_processable_rejects_hidden_file(tmp_path): f = tmp_path / ".hidden" f.write_text("content") assert not _is_processable(f) -def test_is_processable_null_bytes(tmp_path): +def test_is_processable_rejects_null_bytes(tmp_path): f = tmp_path / "binary.bin" f.write_bytes(b"hello\x00world") assert not _is_processable(f) -def test_is_processable_non_utf8(tmp_path): +def test_is_processable_rejects_non_utf8(tmp_path): f = tmp_path / "latin1.txt" f.write_bytes(b"\xff\xfe bad encoding") assert not _is_processable(f) -def test_is_processable_valid_kreport(tmp_path): +def test_is_processable_accepts_valid_kreport(tmp_path): f = tmp_path / "sample.kreport" f.write_text("50.0\t500\t100\tS\t1\tBacteria\n") assert _is_processable(f) From d087e7799e45f55344120aacde76933fb13fc97e Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:56:59 +0200 Subject: [PATCH 21/25] refactor(tests): structure visualization layers, contract assertions, and full-param pipelines in test_kpplot.py --- tests/test_kpplot.py | 157 +++++++++++++++++++++++++++++++++---------- 1 file changed, 122 insertions(+), 35 deletions(-) diff --git a/tests/test_kpplot.py b/tests/test_kpplot.py index a85f74d..48beae3 100644 --- a/tests/test_kpplot.py +++ 
b/tests/test_kpplot.py @@ -1,5 +1,19 @@ -"""kpplot smoke tests and parameter-validation tests.""" - +"""kpplot smoke, parameter-validation, and output-contract tests. + +Sections +-------- +1. Smoke tests — each plot function returns a KpPlotBase without error. +2. sample_order — ValueError when requested samples are missing from the data. +3. cmap validation — ValueError on too-short or wrong-type colour maps. +4. aggregate_by_metadata— aggregation logic and missing-column validation. +5. Base class methods — plotfig() return value and savefig() filesystem contract. +6. Full-parameter smoke — metadata, sample_order, cmap, title all wired together. +""" + +import os + +import matplotlib.pyplot as plt +import pandas as pd import pytest from krakenparser.kpplot.base import KpPlotBase, aggregate_by_metadata @@ -8,47 +22,55 @@ from krakenparser.kpplot.streamgraph import streamgraph # --------------------------------------------------------------------------- -# Smoke tests — verify each plot function returns without error +# Module-local fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def cohort_metadata(): + """Metadata DataFrame mapping S1→CohortA and S2→CohortB.""" + return pd.DataFrame({"Sample_id": ["S1", "S2"], "Group": ["CohortA", "CohortB"]}) + + +# --------------------------------------------------------------------------- +# 1. 
Smoke tests — happy-path return type # --------------------------------------------------------------------------- def test_stackedbar_returns_kpplotbase(relabund_df): - result = stacked_barplot(relabund_df) - assert isinstance(result, KpPlotBase) + assert isinstance(stacked_barplot(relabund_df), KpPlotBase) def test_streamgraph_returns_kpplotbase(relabund_df): - result = streamgraph(relabund_df) - assert isinstance(result, KpPlotBase) + assert isinstance(streamgraph(relabund_df), KpPlotBase) def test_clustermap_returns_kpplotbase(relabund_df): - result = clustermap(relabund_df) - assert isinstance(result, KpPlotBase) + assert isinstance(clustermap(relabund_df), KpPlotBase) # --------------------------------------------------------------------------- -# sample_order validation +# 2. sample_order validation # --------------------------------------------------------------------------- -def test_stackedbar_sample_order_missing_raises(relabund_df): +def test_stackedbar_unknown_sample_in_order_raises(relabund_df): with pytest.raises(ValueError, match="Samples missing"): stacked_barplot(relabund_df, sample_order=["S1", "S2", "GHOST"]) -def test_streamgraph_sample_order_missing_raises(relabund_df): +def test_streamgraph_unknown_sample_in_order_raises(relabund_df): with pytest.raises(ValueError, match="Samples missing"): streamgraph(relabund_df, sample_order=["S1", "GHOST"]) -def test_clustermap_sample_order_missing_raises(relabund_df): +def test_clustermap_unknown_sample_in_order_raises(relabund_df): with pytest.raises(ValueError, match="Samples missing"): clustermap(relabund_df, sample_order=["S1", "GHOST"]) # --------------------------------------------------------------------------- -# cmap validation (stackedbar / streamgraph) +# 3. 
cmap validation (stackedbar / streamgraph) # --------------------------------------------------------------------------- @@ -62,49 +84,114 @@ def test_streamgraph_cmap_too_short_raises(relabund_df): streamgraph(relabund_df, cmap=["red"]) -def test_stackedbar_cmap_invalid_type_raises(relabund_df): +def test_stackedbar_cmap_wrong_type_raises(relabund_df): with pytest.raises(ValueError, match="cmap"): - stacked_barplot(relabund_df, cmap=123) + stacked_barplot(relabund_df, cmap=123) # ty:ignore[invalid-argument-type] -def test_streamgraph_cmap_invalid_type_raises(relabund_df): +def test_streamgraph_cmap_wrong_type_raises(relabund_df): with pytest.raises(ValueError, match="cmap"): - streamgraph(relabund_df, cmap=123) + streamgraph(relabund_df, cmap=123) # ty:ignore[invalid-argument-type] # --------------------------------------------------------------------------- -# aggregate_by_metadata +# 4. aggregate_by_metadata # --------------------------------------------------------------------------- -def test_aggregate_by_metadata_basic(relabund_df): - import pandas as pd - - metadata = pd.DataFrame( - { - "Sample_id": ["S1", "S2"], - "Group": ["A", "A"], - } - ) +def test_aggregate_by_metadata_groups_samples_correctly(relabund_df): + metadata = pd.DataFrame({"Sample_id": ["S1", "S2"], "Group": ["A", "A"]}) result = aggregate_by_metadata(relabund_df, metadata, "Group") + assert "Sample_id" in result.columns assert set(result["Sample_id"]) == {"A"} - # Relative abundance should still sum to 100 per group - total = result["rel_abund_perc"].sum() - assert total == pytest.approx(100.0, abs=1e-6) -def test_aggregate_by_metadata_missing_sample_id_column_raises(relabund_df): - import pandas as pd +def test_aggregate_by_metadata_relative_abundance_sums_to_100(relabund_df): + metadata = pd.DataFrame({"Sample_id": ["S1", "S2"], "Group": ["A", "A"]}) + result = aggregate_by_metadata(relabund_df, metadata, "Group") + assert result["rel_abund_perc"].sum() == pytest.approx(100.0, abs=1e-6) 
+ +def test_aggregate_by_metadata_missing_sample_id_column_raises(relabund_df): bad_meta = pd.DataFrame({"Group": ["A", "B"], "X": [1, 2]}) with pytest.raises(ValueError, match="Sample_id"): aggregate_by_metadata(relabund_df, bad_meta, "Group") def test_aggregate_by_metadata_missing_group_column_raises(relabund_df): - import pandas as pd - meta = pd.DataFrame({"Sample_id": ["S1", "S2"]}) with pytest.raises(ValueError, match="Group"): aggregate_by_metadata(relabund_df, meta, "Group") + + +# --------------------------------------------------------------------------- +# 5. KpPlotBase methods — plotfig() and savefig() +# --------------------------------------------------------------------------- + + +def test_kpplotbase_plotfig_returns_figure(relabund_df): + ctx = stacked_barplot(df=relabund_df) + assert ctx.plotfig() is not None + + +def test_kpplotbase_savefig_writes_file(relabund_df, tmp_path): + ctx = stacked_barplot(df=relabund_df) + img_path = tmp_path / "output.png" + ctx.savefig(img_path) + assert os.path.exists(img_path) + + +# --------------------------------------------------------------------------- +# 6. 
Full-parameter smoke — metadata, sample_order, cmap, title +# --------------------------------------------------------------------------- + + +def test_stackedbar_with_all_parameters(relabund_df, two_sample_metadata, tmp_path): + custom_colors = ["#ff0000", "#00ff00", "#0000ff"] + ctx = stacked_barplot( + df=relabund_df, + metadata=two_sample_metadata, + metadata_group="Group", + sample_order=["TypeA", "TypeB"], + cmap=custom_colors, + title="Full-Parameter Barplot", + ) + assert ctx.fig is not None + + out = tmp_path / "barplot.png" + ctx.fig.savefig(out, dpi=150, bbox_inches="tight") + assert out.exists() + + plt.close(ctx.fig) + + +def test_streamgraph_with_all_parameters(relabund_df, two_sample_metadata): + ctx = streamgraph( + df=relabund_df, + metadata=two_sample_metadata, + metadata_group="Group", + sample_order=["TypeA", "TypeB"], + cmap=["#ff0000", "#00ff00", "#0000ff"], + title="Full-Parameter Streamgraph", + ) + assert ctx.fig is not None + plt.close(ctx.fig) + + +def test_clustermap_with_all_parameters(relabund_df, cohort_metadata): + ctx = clustermap( + df=relabund_df, + metadata=cohort_metadata, + metadata_group="Group", + sample_order=["CohortA", "CohortB"], + title="Full-Parameter Clustermap", + xlabel="X Label", + ylabel="Y Label", + ) + assert ctx.grid is not None + + +def test_clustermap_unknown_sample_in_order_raises_with_message(relabund_df): + with pytest.raises(ValueError, match="Samples missing from the clustermap matrix"): + clustermap(df=relabund_df, sample_order=["GHOST_SAMPLE"]) From 583d6a846f21d2cdd2b4167826e889eab6d0d894 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:57:57 +0200 Subject: [PATCH 22/25] refactor(tests): group logic flows and add descriptive assertions to test_units.py --- tests/test_units.py | 114 +++++++++++++++++++++++--------------------- 1 file changed, 60 insertions(+), 54 deletions(-) diff --git a/tests/test_units.py b/tests/test_units.py index 598d307..9e4891b 100644 --- a/tests/test_units.py 
+++ b/tests/test_units.py @@ -1,4 +1,9 @@ -"""Pure-function unit tests — no I/O, fully deterministic.""" +"""Pure-function unit tests — no I/O, no fixtures, fully deterministic. + +All functions under test are stateless mathematical transforms or text +processors. Tests are grouped by function and ordered from simplest +(zero/one element) to more complex (multi-element, edge cases). +""" import math from pathlib import Path @@ -11,14 +16,15 @@ from krakenparser.stats.diversity import chao1_index, pielou_evenness, shannon_index from krakenparser.utils import ensure_output_dir -# --------------------------------------------------------------------------- +# =========================================================================== # _parse_line -# --------------------------------------------------------------------------- +# =========================================================================== def test_parse_line_standard_rank(): line = "50.00\t500000\t100000\tP\t1224\t Pseudomonadota\n" name, depth, rank, cum_reads, pct = _parse_line(line) + assert name == "Pseudomonadota" assert depth == 2 # 4 leading spaces // 2 assert rank == "P" @@ -26,9 +32,10 @@ def test_parse_line_standard_rank(): assert pct == 50.0 -def test_parse_line_root_no_indent(): +def test_parse_line_root_has_zero_depth(): line = "99.98\t999980\t0\tR\t1\troot\n" name, depth, rank, cum_reads, pct = _parse_line(line) + assert name == "root" assert depth == 0 assert rank == "R" @@ -37,47 +44,48 @@ def test_parse_line_root_no_indent(): def test_parse_line_intermediate_rank(): line = "5.00\t50000\t0\tS1\t12345\t Some subspecies\n" name, depth, rank, _, _ = _parse_line(line) + assert name == "Some subspecies" assert rank == "S1" assert depth == 5 # 10 spaces // 2 -def test_parse_line_too_few_columns(): +def test_parse_line_too_few_columns_returns_empty(): assert _parse_line("50.00\t500000\n") == [] -def test_parse_line_non_numeric_pct(): +def test_parse_line_non_numeric_pct_returns_empty(): assert 
_parse_line("not_a_float\t500000\t0\tP\t1224\tBacteria\n") == [] -def test_parse_line_non_numeric_reads(): +def test_parse_line_non_numeric_reads_returns_empty(): assert _parse_line("50.00\tnot_int\t0\tP\t1224\tBacteria\n") == [] -# --------------------------------------------------------------------------- +# =========================================================================== # shannon_index -# --------------------------------------------------------------------------- - - -def test_shannon_uniform_four_species(): - assert abs(shannon_index([1, 1, 1, 1]) - math.log(4)) < 1e-10 +# =========================================================================== def test_shannon_single_species_is_zero(): assert shannon_index([100]) == pytest.approx(0.0) -def test_shannon_ignores_zero_counts(): - assert abs(shannon_index([1, 1, 1, 1, 0, 0]) - math.log(4)) < 1e-10 - - def test_shannon_two_equal_species(): assert abs(shannon_index([50, 50]) - math.log(2)) < 1e-10 -# --------------------------------------------------------------------------- +def test_shannon_uniform_four_species(): + assert abs(shannon_index([1, 1, 1, 1]) - math.log(4)) < 1e-10 + + +def test_shannon_ignores_zero_counts(): + assert abs(shannon_index([1, 1, 1, 1, 0, 0]) - math.log(4)) < 1e-10 + + +# =========================================================================== # pielou_evenness -# --------------------------------------------------------------------------- +# =========================================================================== def test_pielou_single_species_returns_nan(): @@ -88,112 +96,110 @@ def test_pielou_all_zeros_returns_nan(): assert math.isnan(pielou_evenness([0, 0, 0])) -def test_pielou_uniform_equals_one(): +def test_pielou_uniform_distribution_equals_one(): assert pielou_evenness([1, 1, 1, 1]) == pytest.approx(1.0) -def test_pielou_range_zero_to_one(): +def test_pielou_unequal_distribution_is_between_zero_and_one(): result = pielou_evenness([10, 2, 1, 1]) assert 0.0 < 
result < 1.0 -# --------------------------------------------------------------------------- +# =========================================================================== # chao1_index -# --------------------------------------------------------------------------- +# =========================================================================== -def test_chao1_f2_zero_uses_f1_formula(): - # F1=3, F2=0 → S_obs + F1*(F1-1)/2 - counts = [1, 1, 1, 5, 10] # F1=3, F2=0, S_obs=5 - expected = 5 + 3 * (3 - 1) / 2 # 5 + 3 = 8 - assert chao1_index(counts) == pytest.approx(expected) +def test_chao1_normal_f1_and_f2_present(): + # F1=2, F2=2, S_obs=5 → S_obs + F1² / (2 * F2) = 5 + 1 = 6 + counts = [1, 1, 2, 2, 5] + assert chao1_index(counts) == pytest.approx(6.0) -def test_chao1_normal(): - # F1=2, F2=2, S_obs=5 → S_obs + F1²/(2*F2) - counts = [1, 1, 2, 2, 5] - expected = 5 + (2 * 2) / (2 * 2) # 5 + 1 = 6 - assert chao1_index(counts) == pytest.approx(expected) +def test_chao1_f2_zero_falls_back_to_f1_formula(): + # F1=3, F2=0 → S_obs + F1*(F1-1)/2 = 5 + 3 = 8 + counts = [1, 1, 1, 5, 10] + assert chao1_index(counts) == pytest.approx(8.0) -def test_chao1_no_singletons(): +def test_chao1_no_singletons_or_doubletons_returns_s_obs(): + # F1=0, F2=0 → S_obs + 0 = 4 counts = [5, 10, 15, 20] - # F1=0, F2=0 → uses F2==0 branch: S_obs + 0 assert chao1_index(counts) == pytest.approx(4.0) -# --------------------------------------------------------------------------- +# =========================================================================== # modify_taxa_names -# --------------------------------------------------------------------------- +# =========================================================================== -def test_modify_taxa_names_strips_prefix_and_replaces_underscores(): +def test_modify_taxa_names_strips_species_prefix_and_underscores(): assert modify_taxa_names("s__Homo_sapiens\t100\t200") == "Homo sapiens\t100\t200" -def test_modify_taxa_names_genus(): +def 
test_modify_taxa_names_strips_genus_prefix(): assert modify_taxa_names("g__Escherichia_coli\t50") == "Escherichia coli\t50" -def test_modify_taxa_names_all_prefixes(): +def test_modify_taxa_names_handles_all_standard_prefixes(): for prefix in ["s__", "g__", "f__", "o__", "c__", "p__"]: result = modify_taxa_names(f"{prefix}Some_name\t10") assert result == "Some name\t10" -def test_modify_taxa_names_no_prefix_unchanged(): +def test_modify_taxa_names_leaves_unprefixed_lines_unchanged(): line = "unclassified_reads\t100" assert modify_taxa_names(line) == line -def test_modify_taxa_names_count_fields_not_modified(): - # Underscores in tab-separated count fields must be preserved +def test_modify_taxa_names_preserves_underscores_in_count_fields(): + # Underscores in tab-separated count fields must not be replaced. result = modify_taxa_names("s__My_taxon\t1_000\t2_000") assert result == "My taxon\t1_000\t2_000" -# --------------------------------------------------------------------------- +# =========================================================================== # _strip_path_prefix -# --------------------------------------------------------------------------- +# =========================================================================== -def test_strip_path_prefix_tab_less_line(): +def test_strip_path_prefix_line_without_tab_is_unchanged(): assert _strip_path_prefix("no_tab_here") == "no_tab_here" -def test_strip_path_prefix_normal(): +def test_strip_path_prefix_removes_all_but_last_path_segment(): assert ( _strip_path_prefix("d__Bacteria|s__E_coli\t100\t200") == "s__E_coli\t100\t200" ) -# --------------------------------------------------------------------------- +# =========================================================================== # ensure_output_dir -# --------------------------------------------------------------------------- +# =========================================================================== -def 
test_ensure_output_dir_file_creates_parent(tmp_path): +def test_ensure_output_dir_creates_parent_for_file_path(tmp_path): p = ensure_output_dir(tmp_path / "subdir" / "output.csv", is_file=True) assert (tmp_path / "subdir").is_dir() assert not p.exists() # only the parent is created, not the file itself -def test_ensure_output_dir_dir_creates_directory(tmp_path): +def test_ensure_output_dir_creates_directory(tmp_path): p = ensure_output_dir(tmp_path / "output_dir", is_file=False) assert p.is_dir() -def test_ensure_output_dir_nested_creates_all_parents(tmp_path): +def test_ensure_output_dir_creates_nested_directories(tmp_path): p = ensure_output_dir(tmp_path / "a" / "b" / "c", is_file=False) assert p.is_dir() -def test_ensure_output_dir_returns_path_object(tmp_path): +def test_ensure_output_dir_returns_path_object_for_string_input(tmp_path): p = ensure_output_dir(str(tmp_path / "out.csv"), is_file=True) assert isinstance(p, Path) -def test_ensure_output_dir_idempotent_for_existing_dir(tmp_path): +def test_ensure_output_dir_is_idempotent_for_existing_directory(tmp_path): existing = tmp_path / "already_exists" existing.mkdir() p = ensure_output_dir(existing, is_file=False) From e8f0c9c13c1778a6c03b602846f93b8caea232c3 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:59:02 +0200 Subject: [PATCH 23/25] docs(paper): add disclosures section to paper.md --- paper/paper.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paper/paper.md b/paper/paper.md index 534e249..089b914 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -55,4 +55,8 @@ The `kpplot` module utilizes an object-oriented design inheriting from a unified The functional reliability and execution integrity of `KrakenParser` are validated via automated continuous integration workflows. The utility and user readiness of the software were demonstrated during the [2025 “Bioinformatics Bootcamp”](https://pish.itmo.ru/genomics-bootcamp) hackathon. 
Furthermore, the core structural prototype of this tool was successfully utilized for large-scale metagenomic data analysis by Popov et al., 2025 [@ijms26135941]. +# AI usage disclosure + +Generative AI tools were used during the development of this work to assist with code refactoring, documentation drafting, and manuscript text editing. All software design decisions, implementation, validation, and scientific interpretation were performed and reviewed by the authors. No generative AI tools were used to generate or analyze research data, and all results reported are reproducible from the publicly available source code and documentation. + # References \ No newline at end of file From dbf783677ec858ec050159351e5f7dbe338c1c0f Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 09:59:41 +0200 Subject: [PATCH 24/25] chore(release): bump version to 1.1.2 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3ca793a..a19c147 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "krakenparser" -version = "1.1.1" +version = "1.1.2" description = "A collection of scripts designed to process Kraken2 reports and convert them into CSV format." 
readme = {file = "README.md", content-type = "text/markdown"} license = {file = "LICENSE"} From bcbe409a91b0c3b48528ba15ee8ea2e7bf94691a Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 4 Jun 2026 10:26:21 +0200 Subject: [PATCH 25/25] docs(paper): fix layout spacing in paper.md --- paper/paper.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 089b914..28c33d9 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -32,11 +32,13 @@ Comparative metagenomics and microbiome studies depend fundamentally on cross-sa `KrakenParser` is implemented in Python 3 (distributed via PyPI as `krakenparser`) and follows a modular architecture split into three distinct operational layers: Data Processing, Statistical Analysis, and Visualization. The pipeline can be executed in an end-to-end automated mode by providing global input and output paths directly to the main command, or controlled step-by-step through granular subcommands. ## Data Processing and Filtering + Individual taxonomic reports are programmatically parsed, converted into MetaPhlAn (MPA) tables, and merged into a unified cross-sample master count matrix. This matrix is subsequently deconstructed into distinct tables for each major taxonomic rank. During deconstruction, `KrakenParser` purges internal structural prefixes (e.g., stripping `s__` from species names) and normalizes taxonomic strings by replacing underscores with spaces to ensure human readability and compatibility with downstream software. The core data engine features flexible filtering mechanisms. Users can selectively isolate or exclude specific biological domains or kingdoms (Bacteria, Viruses, Archaea, Fungi) during extraction. While non-target host reads (e.g., human contamination) are filtered out by default to focus on microbial signatures, the `--keep-human` flag preserves host read counts within the output matrices. 
Crucially, `--keep-human` can be combined concurrently with domain-specific filters, allowing the simultaneous evaluation of host-to-microbe or host-to-pathogen abundance ratios within a single run. ## Statistical Analysis + Following matrix generation, the statistical module computes normalization metrics and ecological indices directly: * **Relative Abundance:** Normalizes absolute counts into percentage distributions using the formula: $\text{Relative Abundance} = \left( \frac{\text{Number of individuals of taxa}}{\text{Total number of individuals of all taxa}} \right) \times 100$. A user-defined abundance threshold aggregates rare background taxa into a consolidated `Other` category to simplify downstream parsing and plotting. @@ -44,6 +46,7 @@ Following matrix generation, the statistical module computes normalization metri * **Beta Diversity:** Computes compositional dissimilarity between samples via *Bray-Curtis* [@bray10jt] and *Jaccard* [@jaccard1901etude] distance metrics, exporting standard distance matrices ready for ordination. ## Visualization + The `kpplot` module utilizes an object-oriented design inheriting from a unified base configuration class (`KpPlotBase`), enforcing consistent rendering properties such as DPI, bounding box scaling, and layout properties. Built on top of `matplotlib` [@Hunter2007], `pandas` [@reback2020pandas], and `seaborn` [@Waskom2021], the visualization engine exposes four primary programmatic layouts: * **Stacked Bar Plots:** For comparing relative taxonomic proportions across multi-sample cohorts. @@ -59,4 +62,4 @@ The functional reliability and execution integrity of `KrakenParser` are validat Generative AI tools were used during the development of this work to assist with code refactoring, documentation drafting, and manuscript text editing. All software design decisions, implementation, validation, and scientific interpretation were performed and reviewed by the authors. 
No generative AI tools were used to generate or analyze research data, and all results reported are reproducible from the publicly available source code and documentation. -# References \ No newline at end of file +# References