Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
58c25c0
docs(cli): add type annotations, docstrings, and comments to krakenpa…
iliapopov17 Jun 4, 2026
fa0a081
refactor(pipeline): harden pipeline orchestration with static type hi…
iliapopov17 Jun 4, 2026
e395625
refactor(utils): add explicit type annotations and docstrings to util…
iliapopov17 Jun 4, 2026
de461ff
refactor(init): add explicit type annotation to __all__ export sequence
iliapopov17 Jun 4, 2026
83f155a
refactor(counts): modernize matrix transposition utility with Path ty…
iliapopov17 Jun 4, 2026
7edb694
refactor(counts): add type annotations and docstrings to processing_s…
iliapopov17 Jun 4, 2026
3824525
refactor(counts): add type annotations and docstrings to split_mpa.py
iliapopov17 Jun 4, 2026
f190b3e
refactor(kpplot): expand sub-package initialization namespace and enf…
iliapopov17 Jun 4, 2026
8989fb2
refactor(kpplot): implement static type hinting and docstrings in plo…
iliapopov17 Jun 4, 2026
7073e86
refactor(kpplot): add strict literal constraints, type annotations, a…
iliapopov17 Jun 4, 2026
c8aea1f
refactor(kpplot): add strict literal constraints, type annotations, a…
iliapopov17 Jun 4, 2026
b5dcee4
refactor(kpplot): add strict literal constraints, type annotations, a…
iliapopov17 Jun 4, 2026
fe6464d
refactor(mpa): implement explicit type hints, and docstrings in mpa_t…
iliapopov17 Jun 4, 2026
0560267
refactor(mpa): add strict type annotations, context managers, and doc…
iliapopov17 Jun 4, 2026
ce218ae
refactor(stats): implement strict type hints, logging, and docstrings…
iliapopov17 Jun 4, 2026
1116b04
refactor(stats): implement vectorization, explicit type hints, and do…
iliapopov17 Jun 4, 2026
a1a29e4
refactor(tests): document fixture architecture and add mock configura…
iliapopov17 Jun 4, 2026
a639f50
refactor(tests): parameterize CLI error paths and clean up entrypoint…
iliapopov17 Jun 4, 2026
4e42422
refactor(tests): clarify end-to-end execution scope and clean up pipe…
iliapopov17 Jun 4, 2026
1a01140
refactor(tests): structure integration endpoints and contract checks …
iliapopov17 Jun 4, 2026
d087e77
refactor(tests): structure visualization layers, contract assertions,…
iliapopov17 Jun 4, 2026
583d6a8
refactor(tests): group logic flows and add descriptive assertions to …
iliapopov17 Jun 4, 2026
e8f0c9c
docs(paper): add disclosures section to paper.md
iliapopov17 Jun 4, 2026
dbf7836
chore(release): bump version to 1.1.2
iliapopov17 Jun 4, 2026
bcbe409
docs(paper): fix layout spacing in paper.md
iliapopov17 Jun 4, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion krakenparser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from .kpplot.stackedbar import stacked_barplot
from .kpplot.streamgraph import streamgraph

__all__ = [
__all__: list[str] = [
"stacked_barplot",
"streamgraph",
"clustermap",
Expand Down
62 changes: 42 additions & 20 deletions krakenparser/counts/convert2csv.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
#!/usr/bin/env python
#!/usr/bin/env python3
"""Matrix manipulation utility for restructuring metagenomic abundance tables.

This module converts tab-delimited abundance tables (traditionally structured
with features/taxa as rows and samples as columns) into standardized,
transposed CSV sheets conforming to the tidy data format (samples as rows).
"""

import logging
import sys
from pathlib import Path
from typing import Optional

Expand All @@ -9,40 +15,59 @@

from krakenparser.utils import ensure_output_dir

_log = logging.getLogger(__name__)
# Module-level logger, named after this module
_log: logging.Logger = logging.getLogger(__name__)

app = typer.Typer(
# Typer sub-application exposing this module as the "csv" command
app: typer.Typer = typer.Typer(
name="csv",
add_completion=False,
context_settings={"help_option_names": ["-h", "--help"]},
)


def convert_to_csv(input_file: "str | Path", output_file: "str | Path") -> None:
    """Transpose a tab-separated table and write it out as CSV.

    The input is read with its first column as the index, transposed, and
    written with the new index column labelled ``Sample_id``.

    Args:
        input_file: Path to the tab-separated input table. Both ``str`` and
            ``Path`` are accepted so that callers written against the older
            string-based signature keep working.
        output_file: Path of the CSV file to write. It is handed to
            ``ensure_output_dir(..., is_file=True)`` and the value returned by
            that helper is the path actually written to.

    Raises:
        FileNotFoundError: If ``input_file`` does not refer to an existing file.
    """
    # Coerce up front: a plain string has no ``is_file`` method.
    in_path = Path(input_file)
    if not in_path.is_file():
        raise FileNotFoundError(f"Input file not found: {in_path}")

    # NOTE(review): presumably creates the parent directory of the output
    # file - confirm against krakenparser.utils.ensure_output_dir.
    out_path = ensure_output_dir(output_file, is_file=True)

    # First column becomes the row index, so after ``.T`` the original column
    # headers become the row labels written under "Sample_id".
    table = pd.read_csv(in_path, sep="\t", index_col=0)
    table.T.to_csv(out_path, index_label="Sample_id")

    _log.info("Data successfully transposed and saved to '%s'.", output_file)


@app.callback(invoke_without_command=True)
def main(
ctx: typer.Context,
input_file: Optional[str] = typer.Option(
input_file: Optional[Path] = typer.Option(
None,
"-i",
"--input",
help="Path to the input TXT file. This file should contain sample names in columns and microbial taxa in rows.",
help="Path to the input tab-delimited TXT file (samples in columns, taxa in rows).",
),
output_file: Optional[str] = typer.Option(
output_file: Optional[Path] = typer.Option(
None,
"-o",
"--output",
help="Path to the output CSV file. The script will restructure the data and save it here.",
help="Path to the output transposed CSV file.",
),
) -> None:
"""Reads a TXT file, reorganizes the data, and converts it into a CSV file."""
Expand All @@ -53,16 +78,13 @@ def main(
raise typer.Exit()

if not input_file or not output_file:
print(
"Error: Missing required options '-i / --input' and '-o / --output'.",
file=sys.stderr,
)
print("Error: Missing required options '-i / --input' and '-o / --output'.")
raise typer.Exit(code=1)

try:
convert_to_csv(input_file, output_file)
except FileNotFoundError as e:
print(f"Error: {e}", file=sys.stderr)
print(f"Error: {e}")
raise typer.Exit(code=1)


Expand Down
103 changes: 72 additions & 31 deletions krakenparser/counts/processing_script.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
#!/usr/bin/env python
#!/usr/bin/env python3
"""Post-processing matrix utility for metadata refinement and taxonomic sanitization.

This module cleans upstream pipeline artifacts by removing technical file extensions
from sample headers and restoring canonical spaces to underscore-separated taxonomic
nomenclature strings (e.g., converting 's__Escherichia_coli' to 'Escherichia coli').
The destination file is rewritten through a temporary file that is then renamed over it.
"""

import logging
import os
Expand All @@ -9,73 +16,107 @@

import typer

_log = logging.getLogger(__name__)
# Module-level logger, named after this module
_log: logging.Logger = logging.getLogger(__name__)

app = typer.Typer(
# Typer sub-application exposing this module as the "process" command
app: typer.Typer = typer.Typer(
name="process",
add_completion=False,
context_settings={"help_option_names": ["-h", "--help"]},
)


def modify_taxa_names(line: str) -> str:
    """Drop a leading rank prefix and turn underscores in the taxon name into spaces.

    If ``line`` starts with one of ``s__``, ``g__``, ``f__``, ``o__``, ``c__``
    or ``p__``, the prefix is removed and, in the text before the first tab
    only, every ``_`` is replaced with a space. Everything from the first tab
    onwards is returned unchanged. Lines with any other start are returned
    as-is.

    Args:
        line: One row of the table, without its line terminator.

    Returns:
        The rewritten row, e.g. ``"s__Escherichia_coli\\t10"`` becomes
        ``"Escherichia coli\\t10"``.
    """
    rank_prefixes = ("s__", "g__", "f__", "o__", "c__", "p__")
    for prefix in rank_prefixes:
        if line.startswith(prefix):
            # Only the first tab-separated field is the taxon name; the
            # remaining fields must keep their underscores.
            taxon, tab, remainder = line[len(prefix):].partition("\t")
            return taxon.replace("_", " ") + tab + remainder
    return line


def process_files(source_file: "str | Path", destination_file: "str | Path") -> None:
    """Rewrite ``destination_file`` with a cleaned header and cleaned taxon names.

    The new header is built from the first line of ``source_file``: the line is
    split on whitespace and each token is cut at its first ``.``; the tokens
    are re-joined with tabs. Every line of ``destination_file`` is stripped of
    surrounding whitespace and passed through ``modify_taxa_names``. The header
    followed by those lines (newline-joined, no trailing newline) then replaces
    the destination file.

    The new content is written to a temporary file in the destination's
    directory and moved into place with ``os.replace``, so the destination is
    never left half-written. The temporary file is removed if anything fails
    before the move, and the destination's permission bits are carried over.

    Args:
        source_file: File whose first line supplies the header. ``str`` and
            ``Path`` are both accepted.
        destination_file: File that is read, cleaned and overwritten in place.
            ``str`` and ``Path`` are both accepted.

    Raises:
        FileNotFoundError: If either path does not refer to an existing file.
    """
    # Coerce up front so callers using the older str-based signature still work.
    src_path = Path(source_file)
    dest_path = Path(destination_file)

    if not src_path.is_file():
        raise FileNotFoundError(f"Source file not found: {src_path}")
    if not dest_path.is_file():
        raise FileNotFoundError(f"Destination file not found: {dest_path}")

    # Header: keep only the part of each whitespace-separated token before its
    # first dot (drops suffixes such as file extensions).
    with open(src_path, "r", encoding="utf-8") as handle:
        raw_header = handle.readline()
    header = "\t".join(token.split(".")[0] for token in raw_header.split())

    # Body: the whole destination file is read before anything is written,
    # because the same path is about to be replaced.
    with open(dest_path, "r", encoding="utf-8") as handle:
        cleaned_rows = [modify_taxa_names(row.strip()) for row in handle]

    updated_content = header + "\n" + "\n".join(cleaned_rows)

    tmp_path = None
    try:
        # Same directory as the destination so os.replace stays on one filesystem.
        with tempfile.NamedTemporaryFile(
            mode="w",
            dir=dest_path.parent,
            delete=False,
            suffix=".tmp",
            encoding="utf-8",
        ) as tmp:
            tmp_path = tmp.name
            tmp.write(updated_content)

        # Temporary files are created owner-only; copy the destination's
        # permission bits so the replace does not silently tighten them.
        os.chmod(tmp_path, dest_path.stat().st_mode & 0o7777)
        os.replace(tmp_path, dest_path)
    except BaseException:
        # delete=False means nobody else will clean this up for us.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
        raise

    _log.info("Processed '%s' successfully.", destination_file)


@app.callback(invoke_without_command=True)
def main(
ctx: typer.Context,
input_file: Optional[str] = typer.Option(
input_file: Optional[Path] = typer.Option(
None,
"-i",
"--input",
help="Path to the source file. This file's first line will be read and modified.",
help="Path to the source file (used to extract and truncate header labels).",
),
output_file: Optional[str] = typer.Option(
output_file: Optional[Path] = typer.Option(
None,
"-o",
"--output",
help="Path to the destination file. This file's contents will be updated with cleaned taxa names.",
help="Path to the destination matrix undergoing taxonomic name sanitation.",
),
) -> None:
"""Reads a source file, processes its first line, modifies taxa names in a destination file, and updates it."""
Expand Down
Loading
Loading