From bf0f182cb30504a8c409ba400f45b1a700c6f776 Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Fri, 22 May 2026 15:45:57 +0200
Subject: [PATCH 1/9] build: exclude __main__ guard lines from coverage
 reporting

---
 pyproject.toml | 6 ++++++
 1 file changed, 6 insertions(+)
diff --git a/pyproject.toml b/pyproject.toml
index 4ae9bca..0d777b1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,3 +48,9 @@ omit = [
     "krakenparser/mpa/transform2mpa.py",
     "krakenparser/mpa/mpa_table.py",
 ]
+
+[tool.coverage.report]
+exclude_lines = [
+    "pragma: no cover",
+    "if __name__ == .__main__.:",
+]

From 3cedb3928dfa688ab7d16f21d6579d799a0e06a7 Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Sun, 31 May 2026 16:46:07 +0200
Subject: [PATCH 2/9] refactor(docs): update CLI usage to use --step flag

---
 README.md | 126 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 69 insertions(+), 57 deletions(-)

diff --git a/README.md b/README.md
index 898b34e..ecc02f1 100644
--- a/README.md
+++ b/README.md
@@ -107,15 +107,9 @@ X9,0.7232472324723247,0.7352941176470589,...,0.8066914498141264,0.0
 |![combined_white](https://github.com/user-attachments/assets/48b3f6e3-6dd5-4298-a793-23dcd549e90c)|![kpclust](https://github.com/user-attachments/assets/98a4d540-7c43-4802-8f77-277a5637a7a1)|
 
 ## Quick Start (Full Pipeline)
-To run the full pipeline, use the following command:
-```bash
-KrakenParser --complete -i data/kreports -o results/
-#Having troubles? Run KrakenParser --complete -h
-```
 
-For **reproducible** β-diversity (rarefaction is stochastic by default):
 ```bash
-KrakenParser -i data/kreports -o results/ -s 42
+KrakenParser -i data/kreports -o results/
 ```
 
 This will:
@@ -127,23 +121,46 @@ This will:
 6. Calculate relative abundance
 7. Calculate α & β-diversities
 
-## Installation
+> [!TIP]
+> After the pipeline finishes, KrakenParser prints a reminder to calibrate the
+> rarefaction depth for β-diversity and to re-run relative abundance normalization
+> before visualization — with ready-to-paste example commands tailored to your output paths.
+
+### Full help output
 
 ```
-pip install krakenparser
+usage: KrakenParser [-h] [-i INPUT] [-o OUTPUT] [--viruses] [--keep-human]
+                    [-V] [-d DEPTH] [-s SEED] [--overwrite]
+                    [--step {mpa,combine,split,process,csv,relabund,diversity}]
+
+KrakenParser: Convert Kraken2 Reports to CSV.
+
+options:
+  -h, --help            show this help message and exit
+
+Core Arguments:
+  -i, --input INPUT     Directory containing Kraken2 report files
+  -o, --output OUTPUT   Output directory (default: parent of input)
+  --viruses             Extract only VIRUSES domain taxa in the pipeline
+  --keep-human          Do not filter human-related taxa
+  -V, --version         show program's version number and exit
+
+Pipeline Options (Full Run):
+  -d, --depth DEPTH     Rarefaction depth for β-diversity (default: 1000)
+  -s, --seed SEED       Random seed for reproducible rarefaction (default: random)
+  --overwrite           Overwrite the output directory if it already exists
+
+Advanced (Step-by-step control):
+  --step {mpa,combine,split,process,csv,relabund,diversity}
+                        Run only a specific part of the pipeline.
+                        Type 'krakenparser --step <name> -h' for more.
 ```
 
-## Before Visualization: Grouping Low-Abundance Taxa
-
-The full pipeline automatically calculates relative abundance. Before passing data to visualization, it is strongly recommended to re-run `--relabund` with the `-O` flag — this collapses all taxa below the chosen threshold into a single **"Other"** group, producing much cleaner and more readable plots.
+## Installation
 
-```bash
-KrakenParser --relabund -i data/counts/counts_species.csv -o data/rel_abund/ra_species.csv -O 4
 ```
-
-This groups every taxon with relative abundance **< 4 %** into `Other (<4.0%)`. Adjust the threshold to your data.
-
-> **Note:** The pipeline-generated `rel_abund/ra_*.csv` files (no `-O`) preserve the full unfiltered data — use them for statistical analysis. Use the `-O` variant specifically for visualization.
+pip install krakenparser
+```
 
 ---
 
@@ -151,123 +168,118 @@ This groups every taxon with relative abundance **< 4 %** into `Other (<4.0%)`.
 <summary><b>Using Individual Modules (Advanced)</b></summary>
 <br>
 
-Each step of the pipeline can also be run individually. This is useful for re-running a single step, debugging, or integrating KrakenParser into a custom workflow.
+Each step of the pipeline can be run individually via `--step`. This is useful for re-running a single step, debugging, or integrating KrakenParser into a custom workflow. Run `krakenparser --step <name> -h` to see the full argument list for any step.
 
 ### **Step 1: Convert Kraken2 Reports to MPA Format**
 ```bash
 # Batch mode (directory)
-KrakenParser --kreport2mpa -i data/kreports -o data/intermediate/mpa
+KrakenParser --step mpa -i data/kreports -o data/intermediate/mpa
 # Single file
-KrakenParser --kreport2mpa -r data/kreports/sample.kreport -o data/intermediate/mpa/sample.MPA.TXT
-#Having troubles? Run KrakenParser --kreport2mpa -h
+KrakenParser --step mpa -r data/kreports/sample.kreport -o data/intermediate/mpa/sample.MPA.TXT
 ```
 Converts Kraken2 `.kreport` files into **MPA format**.
 
 ### **Step 2: Combine MPA Files**
 ```bash
-KrakenParser --combine_mpa -i data/intermediate/mpa/* -o data/intermediate/COMBINED.txt
-#Having troubles? Run KrakenParser --combine_mpa -h
+KrakenParser --step combine -i data/intermediate/mpa/* -o data/intermediate/COMBINED.txt
 ```
 Merges multiple MPA files into a single combined table.
 
 ### **Step 3: Extract Taxonomic Levels**
 ```bash
-KrakenParser --deconstruct -i data/intermediate/COMBINED.txt -o data/intermediate
-#Having troubles? Run KrakenParser --deconstruct -h
+KrakenParser --step split -i data/intermediate/COMBINED.txt -o data/intermediate
 ```
 
 By default, human-related taxa (Homo sapiens, Hominidae, Primates, Mammalia, Chordata) are removed. To keep them:
 ```bash
-KrakenParser --deconstruct -i data/intermediate/COMBINED.txt -o data/intermediate --keep-human
+KrakenParser --step split -i data/intermediate/COMBINED.txt -o data/intermediate --keep-human
 ```
 
-To inspect the **Viruses** domain separately:
+To inspect the **Viruses** domain only:
 ```bash
-KrakenParser --deconstruct_viruses -i data/intermediate/COMBINED.txt -o data/counts_viruses
-#Having troubles? Run KrakenParser --deconstruct_viruses -h
+KrakenParser --step split -i data/intermediate/COMBINED.txt -o data/counts_viruses --viruses-only
 ```
 
 ### **Step 4: Process Extracted Taxonomic Data**
 ```bash
-KrakenParser --process -i data/intermediate/COMBINED.txt -o data/intermediate/txt/counts_phylum.txt
-#Having troubles? Run KrakenParser --process -h
+KrakenParser --step process -i data/intermediate/COMBINED.txt -o data/intermediate/txt/counts_phylum.txt
 ```
 
-Repeat on other 5 taxonomical levels (class, order, family, genus, species) or wrap up `KrakenParser --process` in a loop.
+Repeat on other 5 taxonomical levels (class, order, family, genus, species) or wrap `--step process` in a loop.
 
 Cleans up taxonomic names: removes prefixes (`s__`, `g__`, etc.) and replaces underscores with spaces.
 
 ### **Step 5: Convert TXT to CSV**
 ```bash
-KrakenParser --txt2csv -i data/intermediate/txt/counts_phylum.txt -o data/counts/counts_phylum.csv
-#Having troubles? Run KrakenParser --txt2csv -h
+KrakenParser --step csv -i data/intermediate/txt/counts_phylum.txt -o data/counts/counts_phylum.csv
 ```
 Repeat on other 5 taxonomical levels or wrap in a loop. Transposes data so that sample names become rows.
 
 ### **Step 6: Calculate Relative Abundance**
 ```bash
-KrakenParser --relabund -i data/counts/counts_phylum.csv -o data/rel_abund/ra_phylum.csv
-#Having troubles? Run KrakenParser --relabund -h
+KrakenParser --step relabund -i data/counts/counts_phylum.csv -o data/rel_abund/ra_phylum.csv
 ```
 Repeat on other 5 taxonomical levels or wrap in a loop.
 
 With "Other" grouping:
 ```bash
-KrakenParser --relabund -i data/counts/counts_phylum.csv -o data/rel_abund/ra_phylum.csv -O 3.5
+KrakenParser --step relabund -i data/counts/counts_phylum.csv -o data/rel_abund/ra_phylum.csv -O 3.5
 ```
 Groups all taxa with abundance < 3.5 % into `Other (<3.5%)`.
 
 ### **Step 7: Calculate α & β-Diversities**
 ```bash
-KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity
-#Having troubles? Run KrakenParser --diversity -h
+KrakenParser --step diversity -i data/counts/counts_species.csv -o data/diversity
 ```
 
 With a custom rarefaction depth:
 ```bash
-KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity -d 750
+KrakenParser --step diversity -i data/counts/counts_species.csv -o data/diversity -d 750
 ```
 
-For reproducible results (rarefaction uses random subsampling — fix the seed to get the same matrix every run):
+For reproducible results (fix the seed to get the same matrix every run):
 ```bash
-KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity -s 42
+KrakenParser --step diversity -i data/counts/counts_species.csv -o data/diversity -s 42
 ```
 
 ---
 
 ## Arguments Breakdown
 
-### **--complete** (Full Pipeline)
-- Requires `-i`: path to the Kraken2 reports directory (e.g., `data/kreports`).
-- Optional `-o`: output directory (default: parent of `-i`).
-- Optional `--keep-human`: retain human-related taxa (default: filtered out).
-- Optional `-s INT`: random seed for reproducible β-diversity rarefaction (default: random).
+### **Full Pipeline** (`-i`)
+- `-i / --input`: path to the Kraken2 reports directory (e.g., `data/kreports`). Triggers the full pipeline.
+- `-o / --output`: output directory (default: parent of `-i`).
+- `--viruses`: extract only Viruses domain taxa throughout the pipeline.
+- `--keep-human`: retain human-related taxa (default: filtered out).
+- `-d INT / --depth`: rarefaction depth for β-diversity (default: 1000).
+- `-s INT / --seed`: random seed for reproducible β-diversity rarefaction (default: random).
+- `--overwrite`: overwrite the output directory if it already exists.
 
-### **--kreport2mpa** (Step 1)
+### **--step mpa** (Step 1)
 - Batch mode: `-i DIR -o DIR` — converts all files in a directory.
 - Single-file mode: `-r FILE -o FILE`.
 
-### **--combine_mpa** (Step 2)
+### **--step combine** (Step 2)
 - `-i FILE [FILE ...]`: one or more MPA files.
 - `-o FILE`: output merged table.
 
-### **--deconstruct** & **--deconstruct_viruses** (Step 3)
+### **--step split** (Step 3)
 - Extracts **phylum, class, order, family, genus, species** into separate text files.
-- `--deconstruct` removes human-related reads by default; use `--keep-human` to retain them.
-- `--deconstruct_viruses` extracts only the Viruses domain.
+- Removes human-related reads by default; use `--keep-human` to retain them.
+- Use `--viruses-only` to extract only the Viruses domain.
 
-### **--process** (Step 4)
+### **--step process** (Step 4)
 - Removes prefixes (`s__`, `g__`, etc.), replaces underscores with spaces.
 - `-i`: COMBINED.txt (source for sample-name header); `-o`: target txt file.
 
-### **--txt2csv** (Step 5)
+### **--step csv** (Step 5)
 - Transposes a processed txt file into a CSV with sample names as rows.
 
-### **--relabund** (Step 6)
+### **--step relabund** (Step 6)
 - Calculates relative abundance from a total-counts CSV.
 - `-O FLOAT`: group taxa below FLOAT % into `Other (<FLOAT%)`.
 
-### **--diversity** (Step 7)
+### **--step diversity** (Step 7)
 - Shannon, Pielou & Chao1 for α-diversity.
 - Bray-Curtis & Jaccard for β-diversity.
 - `-d INT`: rarefaction depth for β-diversity (default: 1000).

From 00a381fa66d1bb27f728172fa83ec21bf1e3cd18 Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Sun, 31 May 2026 16:47:32 +0200
Subject: [PATCH 3/9] refactor: modernize CLI routing and implement --step
 interface

---
 krakenparser/krakenparser.py | 255 ++++++++++++++++++++++++-----------
 1 file changed, 175 insertions(+), 80 deletions(-)

diff --git a/krakenparser/krakenparser.py b/krakenparser/krakenparser.py
index 28e94bf..d094aa0 100755
--- a/krakenparser/krakenparser.py
+++ b/krakenparser/krakenparser.py
@@ -12,119 +12,214 @@
     __version__ = "unknown"
 
 
-# Main function to run the tool
 def main():
     logging.basicConfig(level=logging.INFO, format="%(message)s")
-    print("KrakenParser by Ilia V. Popov")
-    # Set up argument parser
+    package_dir = Path(__file__).resolve().parent
+
+    # Map of advanced steps for granular pipeline execution control
+    step_map = {
+        "mpa": (package_dir / "mpa" / "transform2mpa.py", []),
+        "combine": (package_dir / "mpa" / "mpa_table.py", []),
+        "split": (package_dir / "counts" / "split_mpa.py", []),
+        "process": (package_dir / "counts" / "processing_script.py", []),
+        "csv": (package_dir / "counts" / "convert2csv.py", []),
+        "relabund": (package_dir / "stats" / "relabund.py", []),
+        "diversity": (package_dir / "stats" / "diversity.py", []),
+    }
+
+    def _build_cmd(
+        script: Path, base_args: list[str], user_args: list[str]
+    ) -> list[str]:
+        if script.suffix == ".py":
+            # Execute as module to preserve relative imports within the package
+            module = ".".join(
+                script.relative_to(package_dir.parent).with_suffix("").parts
+            )
+            return [sys.executable, "-m", module] + base_args + user_args
+        return [str(script)] + base_args + user_args
+
+    # -------------------------------------------------------------------------
+    # 1. Intercept --step execution for sub-module isolation
+    # -------------------------------------------------------------------------
+    if "--step" in sys.argv:
+        step_idx = sys.argv.index("--step")
+        if step_idx + 1 < len(sys.argv):
+            step = sys.argv[step_idx + 1]
+            if step in step_map:
+                script, base_args = step_map[step]
+                passed_args = sys.argv[1:]
+                passed_args.remove("--step")
+                passed_args.remove(step)
+
+                cmd = _build_cmd(script, base_args, passed_args)
+                sys.exit(subprocess.run(cmd).returncode)
+
+    # -------------------------------------------------------------------------
+    # 2. Main Argument Parser Definition
+    # -------------------------------------------------------------------------
     parser = argparse.ArgumentParser(
         description="KrakenParser: Convert Kraken2 Reports to CSV.",
-        add_help=False,
+        formatter_class=argparse.RawTextHelpFormatter,
     )
-    parser.add_argument(
-        "--complete",
-        action="store_true",
-        help="Run the full pipeline (also the default when -i is given)",
+
+    core_group = parser.add_argument_group("Core Arguments")
+    core_group.add_argument(
+        "-i", "--input", help="Directory containing Kraken2 report files"
     )
-    parser.add_argument(
-        "--kreport2mpa",
-        action="store_true",
-        help="Convert Kraken2 Reports to MPA Format",
+    core_group.add_argument(
+        "-o", "--output", help="Output directory (default: parent of input)"
     )
-    parser.add_argument(
-        "--combine_mpa",
+    core_group.add_argument(
+        "--viruses",
         action="store_true",
-        help="Combine MPA Files",
+        help="Extract only VIRUSES domain taxa in the pipeline",
     )
-    parser.add_argument(
-        "--deconstruct",
-        action="store_true",
-        help="Extract Taxonomic Levels from combined MPA file",
+    core_group.add_argument(
+        "--keep-human", action="store_true", help="Do not filter human-related taxa"
     )
-    parser.add_argument(
-        "--deconstruct_viruses",
-        action="store_true",
-        help="Extract Taxonomic Levels from combined MPA file using only VIRUSES domain",
+    core_group.add_argument(
+        "-V", "--version", action="version", version=f"%(prog)s {__version__}"
     )
-    parser.add_argument(
-        "--process",
-        action="store_true",
-        help="Process Extracted Taxonomic Data",
+
+    pipe_group = parser.add_argument_group("Pipeline Options (Full Run)")
+    pipe_group.add_argument(
+        "-d",
+        "--depth",
+        type=int,
+        default=1000,
+        help="Rarefaction depth for β-diversity (default: 1000)",
     )
-    parser.add_argument(
-        "--txt2csv",
-        action="store_true",
-        help="Convert TXT to CSV",
+    pipe_group.add_argument(
+        "-s",
+        "--seed",
+        type=int,
+        help="Random seed for reproducible rarefaction (default: random)",
     )
-    parser.add_argument(
-        "--relabund",
+    pipe_group.add_argument(
+        "--overwrite",
         action="store_true",
-        help="Calculate relative abundance",
+        help="Overwrite the output directory if it already exists",
     )
-    parser.add_argument(
-        "--diversity",
-        action="store_true",
-        help="Calculate α & β-diversities",
-    )
-    parser.add_argument(
-        "-V", "--version", action="version", version=f"%(prog)s {__version__}"
+
+    adv_group = parser.add_argument_group("Advanced (Step-by-step control)")
+    adv_group.add_argument(
+        "--step",
+        choices=list(step_map.keys()),
+        help="Run only a specific part of the pipeline.\nType 'krakenparser --step <name> -h' for more.",
     )
 
-    args, extra_args = parser.parse_known_args()
+    # Suppressed routing flags for strict backwards compatibility
+    legacy_flags = [
+        "--complete",
+        "--kreport2mpa",
+        "--combine_mpa",
+        "--deconstruct",
+        "--deconstruct_viruses",
+        "--process",
+        "--txt2csv",
+        "--relabund",
+        "--diversity",
+    ]
+    for flag in legacy_flags:
+        parser.add_argument(flag, action="store_true", help=argparse.SUPPRESS)
 
-    for _a in extra_args:
+    # -------------------------------------------------------------------------
+    # 3. Routing Logic and Validation
+    # -------------------------------------------------------------------------
+    for _a in sys.argv:
         if "\x00" in _a:
             sys.exit("Error: argument contains invalid null byte.")
 
-    package_dir = Path(__file__).resolve().parent  # Directory of the current script
+    args, unknown_args = parser.parse_known_args()
 
-    # Map flags to (script_path, base_args_to_prepend)
-    command_map = {
+    legacy_map = {
         "complete": (package_dir / "pipeline.py", []),
-        "kreport2mpa": (package_dir / "mpa" / "transform2mpa.py", []),
-        "combine_mpa": (package_dir / "mpa" / "mpa_table.py", []),
-        "deconstruct": (package_dir / "counts" / "split_mpa.py", []),
+        "kreport2mpa": step_map["mpa"],
+        "combine_mpa": step_map["combine"],
+        "deconstruct": step_map["split"],
         "deconstruct_viruses": (
             package_dir / "counts" / "split_mpa.py",
             ["--viruses-only"],
         ),
-        "process": (package_dir / "counts" / "processing_script.py", []),
-        "txt2csv": (package_dir / "counts" / "convert2csv.py", []),
-        "relabund": (package_dir / "stats" / "relabund.py", []),
-        "diversity": (package_dir / "stats" / "diversity.py", []),
+        "process": step_map["process"],
+        "txt2csv": step_map["csv"],
+        "relabund": step_map["relabund"],
+        "diversity": step_map["diversity"],
     }
 
-    if "-h" in sys.argv or "--help" in sys.argv:
-        if not any(getattr(args, key) for key in command_map):
-            parser.print_help()
-            return
+    passed_legacy_args = [
+        arg
+        for arg in sys.argv[1:]
+        if not arg.startswith("--") or arg.lstrip("--") not in legacy_map
+    ]
 
-    def _build_cmd(
-        script: Path, base_args: list[str], user_args: list[str]
-    ) -> list[str]:
-        if script.suffix == ".py":
-            # Run as module (-m) so the krakenparser package stays importable.
-            # Derive dotted module name from path relative to the package root.
-            module = ".".join(
-                script.relative_to(package_dir.parent).with_suffix("").parts
-            )
-            return [sys.executable, "-m", module] + base_args + user_args
-        return [str(script)] + base_args + user_args
+    for flag, (script, base_args) in legacy_map.items():
+        if getattr(args, flag, False):
+            cmd = _build_cmd(script, base_args, passed_legacy_args)
+            sys.exit(subprocess.run(cmd).returncode)
+
+    # Standard entry point: trigger pipeline execution if input directory is provided
+    if args.input:
+        script = package_dir / "pipeline.py"
+        cmd = _build_cmd(script, [], sys.argv[1:])
+
+        in_path = Path(args.input)
+        out_path = Path(args.output) if args.output else in_path.parent
+        out_path.mkdir(parents=True, exist_ok=True)
+        log_file_path = out_path / "krakenparser.log"
 
-    # Find which argument was given and run the corresponding script
-    for arg, (script, base_args) in command_map.items():
-        if getattr(args, arg):
-            subprocess.run(_build_cmd(script, base_args, extra_args), check=True)
-            return
+        with open(log_file_path, "w") as log_file:
+            result = subprocess.run(cmd, stdout=log_file, stderr=subprocess.STDOUT)
 
-    # Default to full pipeline when -i/--input is given without a subcommand
-    if "-i" in extra_args or "--input" in extra_args:
-        complete_script, complete_base = command_map["complete"]
-        subprocess.run(
-            _build_cmd(complete_script, complete_base, extra_args), check=True
-        )
-        return
+        if result.returncode == 0:
+            print("All steps completed successfully!")
+            print(f"Logs saved to {log_file_path}")
 
+            has_depth = any(arg in sys.argv for arg in ["-d", "--depth"])
+            has_seed = any(arg in sys.argv for arg in ["-s", "--seed"])
+
+            out_str = out_path.as_posix()
+
+            print("\n" + "=" * 95)
+
+            if not has_depth and not has_seed:
+                print(
+                    f"""
+[INFO] Pipeline completed using default rarefaction parameters (depth=1000, seed=random).
+       To calibrate beta-diversity sensitivity metrics for this specific dataset,
+       manually execute the diversity sub-module with custom thresholds.
+       Example:
+       krakenparser --step diversity \\
+       -i {out_str}/counts/counts_species.csv \\
+       -o {out_str}/diversity \\
+       --depth 1500 \\
+       --seed 42
+       """.rstrip()
+                )
+
+            print(
+                f"""
+[TIP] Downstream Data Visualization Prerequisite:
+      Relative abundance normalization is required to group low-abundance taxa
+      using the -O / --other <float> parameter. Without filtering the 'long tail'
+      of rare taxa, the resulting visualization will suffer from overplotting
+      and significant loss of interpretability.
+      Example:
+      krakenparser --step relabund \\
+      -i {out_str}/counts/counts_species.csv \\
+      -o {out_str}/rel_abund/counts_species_relabund_3_5.csv \\
+      -O 3.5
+
+{"=" * 95}
+            """.rstrip()
+            )
+        else:
+            print(f"Pipeline failed. Check logs at {log_file_path}")
+
+        sys.exit(result.returncode)
+
+    # Fallback to usage overview if no actionable arguments were provided
+    print("KrakenParser by Ilia V. Popov")
     parser.print_help()
 
 

From bea9762929521a8198c0eed885a9f327f854e83f Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Sun, 31 May 2026 16:48:07 +0200
Subject: [PATCH 4/9] feat: add --viruses flag to full pipeline

---
 krakenparser/pipeline.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/krakenparser/pipeline.py b/krakenparser/pipeline.py
index 4f51e10..5bafa51 100644
--- a/krakenparser/pipeline.py
+++ b/krakenparser/pipeline.py
@@ -41,6 +41,7 @@ def run_pipeline(
     input_dir: str,
     output_dir: str | None = None,
     keep_human: bool = False,
+    viruses_only: bool = False,
     rarefaction_depth: int = 1000,
     seed: int | None = None,
     overwrite: bool = False,
@@ -88,7 +89,12 @@ def run_pipeline(
     _log.info("MPA files combined. Output: %s", combined_file)
 
     # Part 3: split combined MPA by rank
-    split_mpa(str(combined_file), str(intermediate_dir), keep_human=keep_human)
+    split_mpa(
+        str(combined_file),
+        str(intermediate_dir),
+        keep_human=keep_human,
+        viruses_only=viruses_only,
+    )
     txt_dir = intermediate_dir / "txt"
 
     # Part 4: clean taxa names and add sample header
@@ -143,6 +149,12 @@ def main() -> None:
         default=False,
         help="Do not filter human-related taxa (default: filtered)",
     )
+    parser.add_argument(
+        "--viruses",
+        action="store_true",
+        default=False,
+        help="Extract only Viruses domain taxa in the pipeline",
+    )
     parser.add_argument(
         "-d",
         "--depth",
@@ -169,6 +181,7 @@ def main() -> None:
             args.input,
             args.output,
             keep_human=args.keep_human,
+            viruses_only=args.viruses,
             rarefaction_depth=args.depth,
             seed=args.seed,
             overwrite=args.overwrite,

From e15d1efc076abd7cf61b32e494043072627b2c7b Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Sun, 31 May 2026 16:48:52 +0200
Subject: [PATCH 5/9] refactor(mpa): implement hierarchical sorting for
 combined MPA tables

---
 krakenparser/mpa/mpa_table.py | 71 ++++++++++++++++++++++++++---------
 1 file changed, 53 insertions(+), 18 deletions(-)

diff --git a/krakenparser/mpa/mpa_table.py b/krakenparser/mpa/mpa_table.py
index 2837972..a7cb717 100644
--- a/krakenparser/mpa/mpa_table.py
+++ b/krakenparser/mpa/mpa_table.py
@@ -12,9 +12,12 @@
 
 def combine_mpa(in_files: list[str], o_file: str) -> None:
     out_path = ensure_output_dir(o_file, is_file=True)
-    # Plain dict preserves insertion order (Python 3.7+).
-    taxa: dict[str, dict[int, str]] = {}
-    sample_names: list[str] = []
+
+    samples: dict[int, str] = {}
+    values: dict[str, dict[int, str]] = {}
+    parent2child: dict[str, list[str]] = {}
+    toparse: list[str] = []
+    sample_count = 0
 
     _log.info("Number of files to parse: %d", len(in_files))
 
@@ -22,8 +25,10 @@ def combine_mpa(in_files: list[str], o_file: str) -> None:
         if not Path(in_path).is_file():
             raise FileNotFoundError(f"Input file not found: {in_path}")
 
-    for idx, in_path in enumerate(in_files):
-        sample_name = f"Sample #{idx + 1}"
+    for in_path in in_files:
+        sample_count += 1
+        sample_name = f"Sample #{sample_count}"
+
         with open(in_path) as fh:
             for line in fh:
                 line = line.rstrip("\n")
@@ -34,26 +39,56 @@ def combine_mpa(in_files: list[str], o_file: str) -> None:
                     if len(cols) >= 2:
                         sample_name = cols[-1]
                     continue
+
                 cols = line.split("\t", 1)
                 if len(cols) < 2:
                     continue
-                taxon, count = cols[0], cols[1]
-                if taxon not in taxa:
-                    taxa[taxon] = {}
-                taxa[taxon][idx] = count
-        sample_names.append(sample_name)
-
-    n_samples = len(sample_names)
-    n_taxa = len(taxa)
+                classification, val = cols[0], cols[1]
+
+                split_vals = classification.split("|")
+                curr_parent = ""
+                for i in range(len(split_vals)):
+                    test_val = "|".join(split_vals[:i])  # when i == 0 this is ""
+                    if test_val in values:
+                        curr_parent = test_val
+
+                if curr_parent == "":
+                    if classification not in toparse:
+                        toparse.append(classification)
+                else:
+                    if curr_parent not in parent2child:
+                        parent2child[curr_parent] = []
+                    if classification not in parent2child[curr_parent]:
+                        parent2child[curr_parent].append(classification)
+
+                if classification not in values:
+                    values[classification] = {}
+                values[classification][sample_count] = val
+
+        samples[sample_count] = sample_name
+
+    n_taxa = len(values)
     _log.info("Number of classifications to write: %d", n_taxa)
 
+    count_written = 0
     with open(out_path, "w") as fh:
-        fh.write("#Classification\t" + "\t".join(sample_names) + "\n")
-        for taxon, counts in taxa.items():
-            row = [counts.get(i, "0") for i in range(n_samples)]
-            fh.write(taxon + "\t" + "\t".join(row) + "\n")
+        header = "#Classification\t" + "\t".join(
+            samples[i] for i in range(1, sample_count + 1)
+        )
+        fh.write(header + "\n")
+
+        stack = list(toparse)
+        while stack:
+            curr = stack.pop(0)
+            if curr in parent2child:
+                stack = parent2child[curr] + stack
+            row = "\t".join(
+                values[curr].get(i, "0") for i in range(1, sample_count + 1)
+            )
+            fh.write(curr + "\t" + row + "\n")
+            count_written += 1
 
-    _log.info("%d classifications written", n_taxa)
+    _log.info("%d classifications written", count_written)
 
 
 def main() -> None:

From e48266e848abfd9014441c39c9a9a58c1ba89331 Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Sun, 31 May 2026 16:49:39 +0200
Subject: [PATCH 6/9] refactor(mpa): overhaul kreport parsing logic for robust
 hierarchy tracking

---
 krakenparser/mpa/transform2mpa.py | 142 +++++++++++++++++-------------
 1 file changed, 82 insertions(+), 60 deletions(-)

diff --git a/krakenparser/mpa/transform2mpa.py b/krakenparser/mpa/transform2mpa.py
index 8d362f7..cf0bdae 100644
--- a/krakenparser/mpa/transform2mpa.py
+++ b/krakenparser/mpa/transform2mpa.py
@@ -9,51 +9,65 @@
 
 from krakenparser.utils import ensure_output_dir
 
-# Maps Kraken2 single-letter rank codes to MPA prefixes
-_RANK_PREFIX = {
-    "D": "d",
-    "K": "k",
-    "P": "p",
-    "C": "c",
-    "O": "o",
-    "F": "f",
-    "G": "g",
-    "S": "s",
-}
-
 _log = logging.getLogger(__name__)
 
+_MAIN_LVLS = {"R", "K", "D", "P", "C", "O", "F", "G", "S"}
+
 
-def _parse_line(line: str):
+def _parse_line(line: str, remove_spaces: bool = False) -> list:
     """
     Parse one Kraken2 report line.
 
-    Standard format (6 columns):
-        pct  cum_reads  direct_reads  rank  taxid  name(indented)
-
-    Returns (name, depth, rank, cum_reads, pct) or None on malformed input.
+    Returns [name, level_num, level_type, all_reads, percents]
+    or empty list on malformed input.
     """
     parts = line.rstrip("\n").split("\t")
-    if len(parts) < 5:
-        return None
+    if len(parts) < 4:
+        return []
     try:
-        pct = float(parts[0])
-        cum_reads = int(parts[1])
+        int(parts[1])
     except ValueError:
-        return None
+        return []
 
-    rank = parts[3].strip()
-    name_field = parts[-1]  # always the last column regardless of format variant
+    try:
+        percents = float(parts[0])
+    except ValueError:
+        return []
+    all_reads = int(parts[1])
+
+    try:
+        int(parts[-3])
+        level_type = parts[-2].strip()
+        map_kuniq = {
+            "species": "S",
+            "genus": "G",
+            "family": "F",
+            "order": "O",
+            "class": "C",
+            "phylum": "P",
+            "superkingdom": "D",
+            "kingdom": "K",
+        }
+        if level_type not in map_kuniq:
+            level_type = "-"
+        else:
+            level_type = map_kuniq[level_type]
+    except ValueError:
+        level_type = parts[-3].strip()
 
-    depth = 0
-    for ch in name_field:
+    name = parts[-1]
+    spaces = 0
+    for ch in name:
         if ch == " ":
-            depth += 1
+            spaces += 1
         else:
             break
-    name = name_field.strip()
+    name = name.strip()
+    if remove_spaces:
+        name = name.replace(" ", "_")
 
-    return name, depth // 2, rank, cum_reads, pct
+    level_num = spaces / 2
+    return [name, level_num, level_type, all_reads, percents]
 
 
 def kreport_to_mpa(
@@ -67,54 +81,63 @@ def kreport_to_mpa(
     """
     Convert a single Kraken2 report to MPA format.
 
-    Uses a stack to track the current taxonomic path. Each entry is
-    (structural_depth, mpa_segment, is_standard_rank). When a node at
-    depth d is encountered, all stack entries with depth >= d are popped
-    before the new entry is pushed, keeping the path consistent.
+    Tracks the current taxonomic path via curr_path and prev_lvl_num,
+    popping curr_path when a line sits at the same or a shallower depth
+    than the previous one — exactly as the original script does.
     """
     if not Path(report_path).is_file():
         raise FileNotFoundError(f"Input file not found: {report_path}")
     out_path = ensure_output_dir(output_path, is_file=True)
-    # Stack entries: (structural_depth, mpa_segment, is_standard_rank)
-    stack: list[tuple[int, str, bool]] = []
+
+    curr_path: list[str] = []
+    prev_lvl_num = -1
 
     with open(report_path) as r_fh, open(out_path, "w") as o_fh:
         if display_header:
             o_fh.write("#Classification\t" + os.path.basename(report_path) + "\n")
 
         for line in r_fh:
-            parsed = _parse_line(line)
-            if parsed is None:
+            report_vals = _parse_line(line, remove_spaces)
+            if len(report_vals) < 5:
                 continue
-            name, depth, rank, cum_reads, pct = parsed
 
-            # Skip unclassified and root — never appear in MPA output
-            if rank in ("U", "R"):
+            name, level_num, level_type, all_reads, percents = report_vals
+
+            # Skip unclassified entries
+            if level_type == "U":
                 continue
 
-            # Strip numeric suffix to get base rank (e.g. "S1" → "S", "G2" → "G")
-            rank_base = rank.rstrip("0123456789")
-            is_standard = rank_base in _RANK_PREFIX and rank == rank_base
+            # Normalize the level type
+            if level_type not in _MAIN_LVLS:
+                level_type = "x"
+            elif level_type == "K":
+                level_type = "k"
+            elif level_type == "D":
+                level_type = "d"
 
-            if not is_standard and not include_intermediate:
-                continue
+            level_str = level_type.lower() + "__" + name
 
-            prefix = _RANK_PREFIX.get(rank_base, "x")
-            seg_name = name.replace(" ", "_") if remove_spaces else name
-            mpa_seg = f"{prefix}__{seg_name}"
+            if prev_lvl_num == -1:
+                prev_lvl_num = level_num
+                curr_path.append(level_str)
+                continue
 
-            # Trim stack to the current structural depth
-            while stack and stack[-1][0] >= depth:
-                stack.pop()
-            stack.append((depth, mpa_seg, is_standard))
+            while level_num != (prev_lvl_num + 1):
+                prev_lvl_num -= 1
+                curr_path.pop()
 
-            # Build the full MPA path; omit intermediate (x__) segments when not requested
-            path = "|".join(
-                seg for (_, seg, std) in stack if include_intermediate or std
-            )
+            if (level_type == "x" and include_intermediate) or level_type != "x":
+                ancestors = [
+                    seg
+                    for seg in curr_path
+                    if (seg[0] != "x" or include_intermediate) and seg[0] != "r"
+                ]
+                path = "|".join(ancestors + [level_str])
+                value = str(all_reads) if use_reads else str(percents)
+                o_fh.write(path + "\t" + value + "\n")
 
-            value = str(cum_reads) if use_reads else str(pct)
-            o_fh.write(f"{path}\t{value}\n")
+            curr_path.append(level_str)
+            prev_lvl_num = level_num
 
 
 def main() -> None:
@@ -137,7 +160,6 @@ def main() -> None:
         dest="input_dir",
         help="Input directory containing Kraken2 report files (batch mode)",
     )
-
     parser.add_argument(
         "-o",
         "--output",
@@ -212,7 +234,7 @@ def main() -> None:
                 continue
             out_name = f.name.replace(".kreport", ".MPA.TXT")
             kreport_to_mpa(str(f), str(output_dir / out_name), **kwargs)
-        _log.info(f"Converted to MPA successfully. Output stored in {output_dir}")
+        _log.info("Converted to MPA successfully. Output stored in %s", output_dir)
     else:
         kreport_to_mpa(args.r_file, args.o_file, **kwargs)
 

From 06eedbbe8ce77af3b0351cd09a251636e9c88880 Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Sun, 31 May 2026 16:50:19 +0200
Subject: [PATCH 7/9] test: update unit tests for _parse_line return type
 changes

---
 tests/test_units.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_units.py b/tests/test_units.py
index 414d87d..598d307 100644
--- a/tests/test_units.py
+++ b/tests/test_units.py
@@ -43,15 +43,15 @@ def test_parse_line_intermediate_rank():
 
 
 def test_parse_line_too_few_columns():
-    assert _parse_line("50.00\t500000\n") is None
+    assert _parse_line("50.00\t500000\n") == []
 
 
 def test_parse_line_non_numeric_pct():
-    assert _parse_line("not_a_float\t500000\t0\tP\t1224\tBacteria\n") is None
+    assert _parse_line("not_a_float\t500000\t0\tP\t1224\tBacteria\n") == []
 
 
 def test_parse_line_non_numeric_reads():
-    assert _parse_line("50.00\tnot_int\t0\tP\t1224\tBacteria\n") is None
+    assert _parse_line("50.00\tnot_int\t0\tP\t1224\tBacteria\n") == []
 
 
 # ---------------------------------------------------------------------------

From 54bf8e7cdcc57ad0b276665d6270576f05c2f190 Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Sun, 31 May 2026 16:53:52 +0200
Subject: [PATCH 8/9] refactor: improve CLI startup output visibility

---
 krakenparser/krakenparser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/krakenparser/krakenparser.py b/krakenparser/krakenparser.py
index d094aa0..523a3f5 100755
--- a/krakenparser/krakenparser.py
+++ b/krakenparser/krakenparser.py
@@ -13,6 +13,7 @@
 
 
 def main():
+    print("KrakenParser by Ilia V. Popov")
     logging.basicConfig(level=logging.INFO, format="%(message)s")
     package_dir = Path(__file__).resolve().parent
 
@@ -219,7 +220,6 @@ def _build_cmd(
         sys.exit(result.returncode)
 
     # Fallback to usage overview if no actionable arguments were provided
-    print("KrakenParser by Ilia V. Popov")
     parser.print_help()
 
 

From 766ce5243e8ca851ff7cc74d66fbeffe004f3faa Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Sun, 31 May 2026 17:40:37 +0200
Subject: [PATCH 9/9] docs: update results directory structure in README

---
 README.md | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index ecc02f1..d96b894 100644
--- a/README.md
+++ b/README.md
@@ -305,16 +305,17 @@ results/
 │  ├─ alpha_div.csv
 │  ├─ beta_div_bray.csv
 │  └─ beta_div_jaccard.csv
-└─ intermediate/           # Intermediate files
-   ├─ mpa/                 # Converted MPA files
-   │  ├─ {sample}.txt
-   │  ├─ ...
-   ├─ COMBINED.txt         # Merged MPA table
-   └─ txt/                 # Extracted taxonomic levels in TXT
-      ├─ counts_species.txt
-      ├─ counts_genus.txt
-      ├─ ...
-      └─ counts_phylum.txt
+├─ intermediate/           # Intermediate files
+│  ├─ mpa/                 # Converted MPA files
+│  │  ├─ {sample}.txt
+│  │  ├─ ...
+│  ├─ COMBINED.txt         # Merged MPA table
+│  └─ txt/                 # Extracted taxonomic levels in TXT
+│     ├─ counts_species.txt
+│     ├─ counts_genus.txt
+│     ├─ ...
+│     └─ counts_phylum.txt
+└─ krakenparser.log         # Pipeline execution logs
 ```
 
 ## Conclusion