From bf0f182cb30504a8c409ba400f45b1a700c6f776 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Fri, 22 May 2026 15:45:57 +0200 Subject: [PATCH 1/9] build: exclude __main__ guard lines from coverage reporting --- pyproject.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 4ae9bca..0d777b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,3 +48,9 @@ omit = [ "krakenparser/mpa/transform2mpa.py", "krakenparser/mpa/mpa_table.py", ] + +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "if __name__ == .__main__.:", +] From 3cedb3928dfa688ab7d16f21d6579d799a0e06a7 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Sun, 31 May 2026 16:46:07 +0200 Subject: [PATCH 2/9] refactor(docs): update CLI usage to use --step flag --- README.md | 126 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 69 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index 898b34e..ecc02f1 100644 --- a/README.md +++ b/README.md @@ -107,15 +107,9 @@ X9,0.7232472324723247,0.7352941176470589,...,0.8066914498141264,0.0 |![combined_white](https://github.com/user-attachments/assets/48b3f6e3-6dd5-4298-a793-23dcd549e90c)|![kpclust](https://github.com/user-attachments/assets/98a4d540-7c43-4802-8f77-277a5637a7a1)| ## Quick Start (Full Pipeline) -To run the full pipeline, use the following command: -```bash -KrakenParser --complete -i data/kreports -o results/ -#Having troubles? Run KrakenParser --complete -h -``` -For **reproducible** β-diversity (rarefaction is stochastic by default): ```bash -KrakenParser -i data/kreports -o results/ -s 42 +KrakenParser -i data/kreports -o results/ ``` This will: @@ -127,23 +121,46 @@ This will: 6. Calculate relative abundance 7. 
Calculate α & β-diversities -## Installation +> [!TIP] +> After the pipeline finishes, the output window will remind you about calibrating +> rarefaction depth for β-diversity and re-running relative abundance normalization +> before visualization — with ready-to-paste example commands tailored to your output paths. + +### Full help output ``` -pip install krakenparser +usage: KrakenParser [-h] [-i INPUT] [-o OUTPUT] [--viruses] [--keep-human] + [-V] [-d DEPTH] [-s SEED] [--overwrite] + [--step {mpa,combine,split,process,csv,relabund,diversity}] + +KrakenParser: Convert Kraken2 Reports to CSV. + +options: + -h, --help show this help message and exit + +Core Arguments: + -i, --input INPUT Directory containing Kraken2 report files + -o, --output OUTPUT Output directory (default: parent of input) + --viruses Extract only VIRUSES domain taxa in the pipeline + --keep-human Do not filter human-related taxa + -V, --version show program's version number and exit + +Pipeline Options (Full Run): + -d, --depth DEPTH Rarefaction depth for β-diversity (default: 1000) + -s, --seed SEED Random seed for reproducible rarefaction (default: random) + --overwrite Overwrite the output directory if it already exists + +Advanced (Step-by-step control): + --step {mpa,combine,split,process,csv,relabund,diversity} + Run only a specific part of the pipeline. + Type 'krakenparser --step <step> -h' for more. ``` -## Before Visualization: Grouping Low-Abundance Taxa - -The full pipeline automatically calculates relative abundance. Before passing data to visualization, it is strongly recommended to re-run `--relabund` with the `-O` flag — this collapses all taxa below the chosen threshold into a single **"Other"** group, producing much cleaner and more readable plots. +## Installation -```bash -KrakenParser --relabund -i data/counts/counts_species.csv -o data/rel_abund/ra_species.csv -O 4 ``` - -This groups every taxon with relative abundance **< 4 %** into `Other (<4.0%)`. 
Adjust the threshold to your data. - -> **Note:** The pipeline-generated `rel_abund/ra_*.csv` files (no `-O`) preserve the full unfiltered data — use them for statistical analysis. Use the `-O` variant specifically for visualization. +pip install krakenparser +``` --- @@ -151,123 +168,118 @@ This groups every taxon with relative abundance **< 4 %** into `Other (<4.0%)`. Using Individual Modules (Advanced)
-Each step of the pipeline can also be run individually. This is useful for re-running a single step, debugging, or integrating KrakenParser into a custom workflow. +Each step of the pipeline can be run individually via `--step`. This is useful for re-running a single step, debugging, or integrating KrakenParser into a custom workflow. Run `krakenparser --step <step> -h` to see the full argument list for any step. ### **Step 1: Convert Kraken2 Reports to MPA Format** ```bash # Batch mode (directory) -KrakenParser --kreport2mpa -i data/kreports -o data/intermediate/mpa +KrakenParser --step mpa -i data/kreports -o data/intermediate/mpa # Single file -KrakenParser --kreport2mpa -r data/kreports/sample.kreport -o data/intermediate/mpa/sample.MPA.TXT -#Having troubles? Run KrakenParser --kreport2mpa -h +KrakenParser --step mpa -r data/kreports/sample.kreport -o data/intermediate/mpa/sample.MPA.TXT ``` Converts Kraken2 `.kreport` files into **MPA format**. ### **Step 2: Combine MPA Files** ```bash -KrakenParser --combine_mpa -i data/intermediate/mpa/* -o data/intermediate/COMBINED.txt -#Having troubles? Run KrakenParser --combine_mpa -h +KrakenParser --step combine -i data/intermediate/mpa/* -o data/intermediate/COMBINED.txt ``` Merges multiple MPA files into a single combined table. ### **Step 3: Extract Taxonomic Levels** ```bash -KrakenParser --deconstruct -i data/intermediate/COMBINED.txt -o data/intermediate -#Having troubles? Run KrakenParser --deconstruct -h +KrakenParser --step split -i data/intermediate/COMBINED.txt -o data/intermediate ``` By default, human-related taxa (Homo sapiens, Hominidae, Primates, Mammalia, Chordata) are removed. 
To keep them: ```bash -KrakenParser --deconstruct -i data/intermediate/COMBINED.txt -o data/intermediate --keep-human +KrakenParser --step split -i data/intermediate/COMBINED.txt -o data/intermediate --keep-human ``` -To inspect the **Viruses** domain separately: +To inspect the **Viruses** domain only: ```bash -KrakenParser --deconstruct_viruses -i data/intermediate/COMBINED.txt -o data/counts_viruses -#Having troubles? Run KrakenParser --deconstruct_viruses -h +KrakenParser --step split -i data/intermediate/COMBINED.txt -o data/counts_viruses --viruses-only ``` ### **Step 4: Process Extracted Taxonomic Data** ```bash -KrakenParser --process -i data/intermediate/COMBINED.txt -o data/intermediate/txt/counts_phylum.txt -#Having troubles? Run KrakenParser --process -h +KrakenParser --step process -i data/intermediate/COMBINED.txt -o data/intermediate/txt/counts_phylum.txt ``` -Repeat on other 5 taxonomical levels (class, order, family, genus, species) or wrap up `KrakenParser --process` in a loop. +Repeat on other 5 taxonomical levels (class, order, family, genus, species) or wrap `--step process` in a loop. Cleans up taxonomic names: removes prefixes (`s__`, `g__`, etc.) and replaces underscores with spaces. ### **Step 5: Convert TXT to CSV** ```bash -KrakenParser --txt2csv -i data/intermediate/txt/counts_phylum.txt -o data/counts/counts_phylum.csv -#Having troubles? Run KrakenParser --txt2csv -h +KrakenParser --step csv -i data/intermediate/txt/counts_phylum.txt -o data/counts/counts_phylum.csv ``` Repeat on other 5 taxonomical levels or wrap in a loop. Transposes data so that sample names become rows. ### **Step 6: Calculate Relative Abundance** ```bash -KrakenParser --relabund -i data/counts/counts_phylum.csv -o data/rel_abund/ra_phylum.csv -#Having troubles? Run KrakenParser --relabund -h +KrakenParser --step relabund -i data/counts/counts_phylum.csv -o data/rel_abund/ra_phylum.csv ``` Repeat on other 5 taxonomical levels or wrap in a loop. 
With "Other" grouping: ```bash -KrakenParser --relabund -i data/counts/counts_phylum.csv -o data/rel_abund/ra_phylum.csv -O 3.5 +KrakenParser --step relabund -i data/counts/counts_phylum.csv -o data/rel_abund/ra_phylum.csv -O 3.5 ``` Groups all taxa with abundance < 3.5 % into `Other (<3.5%)`. ### **Step 7: Calculate α & β-Diversities** ```bash -KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity -#Having troubles? Run KrakenParser --diversity -h +KrakenParser --step diversity -i data/counts/counts_species.csv -o data/diversity ``` With a custom rarefaction depth: ```bash -KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity -d 750 +KrakenParser --step diversity -i data/counts/counts_species.csv -o data/diversity -d 750 ``` -For reproducible results (rarefaction uses random subsampling — fix the seed to get the same matrix every run): +For reproducible results (fix the seed to get the same matrix every run): ```bash -KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity -s 42 +KrakenParser --step diversity -i data/counts/counts_species.csv -o data/diversity -s 42 ``` --- ## Arguments Breakdown -### **--complete** (Full Pipeline) -- Requires `-i`: path to the Kraken2 reports directory (e.g., `data/kreports`). -- Optional `-o`: output directory (default: parent of `-i`). -- Optional `--keep-human`: retain human-related taxa (default: filtered out). -- Optional `-s INT`: random seed for reproducible β-diversity rarefaction (default: random). +### **Full Pipeline** (`-i`) +- `-i / --input`: path to the Kraken2 reports directory (e.g., `data/kreports`). Triggers the full pipeline. +- `-o / --output`: output directory (default: parent of `-i`). +- `--viruses`: extract only Viruses domain taxa throughout the pipeline. +- `--keep-human`: retain human-related taxa (default: filtered out). +- `-d INT / --depth`: rarefaction depth for β-diversity (default: 1000). 
+- `-s INT / --seed`: random seed for reproducible β-diversity rarefaction (default: random). +- `--overwrite`: overwrite the output directory if it already exists. -### **--kreport2mpa** (Step 1) +### **--step mpa** (Step 1) - Batch mode: `-i DIR -o DIR` — converts all files in a directory. - Single-file mode: `-r FILE -o FILE`. -### **--combine_mpa** (Step 2) +### **--step combine** (Step 2) - `-i FILE [FILE ...]`: one or more MPA files. - `-o FILE`: output merged table. -### **--deconstruct** & **--deconstruct_viruses** (Step 3) +### **--step split** (Step 3) - Extracts **phylum, class, order, family, genus, species** into separate text files. -- `--deconstruct` removes human-related reads by default; use `--keep-human` to retain them. -- `--deconstruct_viruses` extracts only the Viruses domain. +- Removes human-related reads by default; use `--keep-human` to retain them. +- Use `--viruses-only` to extract only the Viruses domain. -### **--process** (Step 4) +### **--step process** (Step 4) - Removes prefixes (`s__`, `g__`, etc.), replaces underscores with spaces. - `-i`: COMBINED.txt (source for sample-name header); `-o`: target txt file. -### **--txt2csv** (Step 5) +### **--step csv** (Step 5) - Transposes a processed txt file into a CSV with sample names as rows. -### **--relabund** (Step 6) +### **--step relabund** (Step 6) - Calculates relative abundance from a total-counts CSV. 
- `-O FLOAT`: group taxa below FLOAT % into `Other ( Date: Sun, 31 May 2026 16:47:32 +0200 Subject: [PATCH 3/9] refactor: modernize CLI routing and implement --step interface --- krakenparser/krakenparser.py | 255 ++++++++++++++++++++++++----------- 1 file changed, 175 insertions(+), 80 deletions(-) diff --git a/krakenparser/krakenparser.py b/krakenparser/krakenparser.py index 28e94bf..d094aa0 100755 --- a/krakenparser/krakenparser.py +++ b/krakenparser/krakenparser.py @@ -12,119 +12,214 @@ __version__ = "unknown" -# Main function to run the tool def main(): logging.basicConfig(level=logging.INFO, format="%(message)s") - print("KrakenParser by Ilia V. Popov") - # Set up argument parser + package_dir = Path(__file__).resolve().parent + + # Map of advanced steps for granular pipeline execution control + step_map = { + "mpa": (package_dir / "mpa" / "transform2mpa.py", []), + "combine": (package_dir / "mpa" / "mpa_table.py", []), + "split": (package_dir / "counts" / "split_mpa.py", []), + "process": (package_dir / "counts" / "processing_script.py", []), + "csv": (package_dir / "counts" / "convert2csv.py", []), + "relabund": (package_dir / "stats" / "relabund.py", []), + "diversity": (package_dir / "stats" / "diversity.py", []), + } + + def _build_cmd( + script: Path, base_args: list[str], user_args: list[str] + ) -> list[str]: + if script.suffix == ".py": + # Execute as module to preserve relative imports within the package + module = ".".join( + script.relative_to(package_dir.parent).with_suffix("").parts + ) + return [sys.executable, "-m", module] + base_args + user_args + return [str(script)] + base_args + user_args + + # ------------------------------------------------------------------------- + # 1. 
Intercept --step execution for sub-module isolation + # ------------------------------------------------------------------------- + if "--step" in sys.argv: + step_idx = sys.argv.index("--step") + if step_idx + 1 < len(sys.argv): + step = sys.argv[step_idx + 1] + if step in step_map: + script, base_args = step_map[step] + passed_args = sys.argv[1:] + passed_args.remove("--step") + passed_args.remove(step) + + cmd = _build_cmd(script, base_args, passed_args) + sys.exit(subprocess.run(cmd).returncode) + + # ------------------------------------------------------------------------- + # 2. Main Argument Parser Definition + # ------------------------------------------------------------------------- parser = argparse.ArgumentParser( description="KrakenParser: Convert Kraken2 Reports to CSV.", - add_help=False, + formatter_class=argparse.RawTextHelpFormatter, ) - parser.add_argument( - "--complete", - action="store_true", - help="Run the full pipeline (also the default when -i is given)", + + core_group = parser.add_argument_group("Core Arguments") + core_group.add_argument( + "-i", "--input", help="Directory containing Kraken2 report files" ) - parser.add_argument( - "--kreport2mpa", - action="store_true", - help="Convert Kraken2 Reports to MPA Format", + core_group.add_argument( + "-o", "--output", help="Output directory (default: parent of input)" ) - parser.add_argument( - "--combine_mpa", + core_group.add_argument( + "--viruses", action="store_true", - help="Combine MPA Files", + help="Extract only VIRUSES domain taxa in the pipeline", ) - parser.add_argument( - "--deconstruct", - action="store_true", - help="Extract Taxonomic Levels from combined MPA file", + core_group.add_argument( + "--keep-human", action="store_true", help="Do not filter human-related taxa" ) - parser.add_argument( - "--deconstruct_viruses", - action="store_true", - help="Extract Taxonomic Levels from combined MPA file using only VIRUSES domain", + core_group.add_argument( + "-V", "--version", 
action="version", version=f"%(prog)s {__version__}" ) - parser.add_argument( - "--process", - action="store_true", - help="Process Extracted Taxonomic Data", + + pipe_group = parser.add_argument_group("Pipeline Options (Full Run)") + pipe_group.add_argument( + "-d", + "--depth", + type=int, + default=1000, + help="Rarefaction depth for β-diversity (default: 1000)", ) - parser.add_argument( - "--txt2csv", - action="store_true", - help="Convert TXT to CSV", + pipe_group.add_argument( + "-s", + "--seed", + type=int, + help="Random seed for reproducible rarefaction (default: random)", ) - parser.add_argument( - "--relabund", + pipe_group.add_argument( + "--overwrite", action="store_true", - help="Calculate relative abundance", + help="Overwrite the output directory if it already exists", ) - parser.add_argument( - "--diversity", - action="store_true", - help="Calculate α & β-diversities", - ) - parser.add_argument( - "-V", "--version", action="version", version=f"%(prog)s {__version__}" + + adv_group = parser.add_argument_group("Advanced (Step-by-step control)") + adv_group.add_argument( + "--step", + choices=list(step_map.keys()), + help="Run only a specific part of the pipeline.\nType 'krakenparser --step -h' for more.", ) - args, extra_args = parser.parse_known_args() + # Suppressed routing flags for strict backwards compatibility + legacy_flags = [ + "--complete", + "--kreport2mpa", + "--combine_mpa", + "--deconstruct", + "--deconstruct_viruses", + "--process", + "--txt2csv", + "--relabund", + "--diversity", + ] + for flag in legacy_flags: + parser.add_argument(flag, action="store_true", help=argparse.SUPPRESS) - for _a in extra_args: + # ------------------------------------------------------------------------- + # 3. 
Routing Logic and Validation + # ------------------------------------------------------------------------- + for _a in sys.argv: if "\x00" in _a: sys.exit("Error: argument contains invalid null byte.") - package_dir = Path(__file__).resolve().parent # Directory of the current script + args, unknown_args = parser.parse_known_args() - # Map flags to (script_path, base_args_to_prepend) - command_map = { + legacy_map = { "complete": (package_dir / "pipeline.py", []), - "kreport2mpa": (package_dir / "mpa" / "transform2mpa.py", []), - "combine_mpa": (package_dir / "mpa" / "mpa_table.py", []), - "deconstruct": (package_dir / "counts" / "split_mpa.py", []), + "kreport2mpa": step_map["mpa"], + "combine_mpa": step_map["combine"], + "deconstruct": step_map["split"], "deconstruct_viruses": ( package_dir / "counts" / "split_mpa.py", ["--viruses-only"], ), - "process": (package_dir / "counts" / "processing_script.py", []), - "txt2csv": (package_dir / "counts" / "convert2csv.py", []), - "relabund": (package_dir / "stats" / "relabund.py", []), - "diversity": (package_dir / "stats" / "diversity.py", []), + "process": step_map["process"], + "txt2csv": step_map["csv"], + "relabund": step_map["relabund"], + "diversity": step_map["diversity"], } - if "-h" in sys.argv or "--help" in sys.argv: - if not any(getattr(args, key) for key in command_map): - parser.print_help() - return + passed_legacy_args = [ + arg + for arg in sys.argv[1:] + if not arg.startswith("--") or arg.lstrip("--") not in legacy_map + ] - def _build_cmd( - script: Path, base_args: list[str], user_args: list[str] - ) -> list[str]: - if script.suffix == ".py": - # Run as module (-m) so the krakenparser package stays importable. - # Derive dotted module name from path relative to the package root. 
- module = ".".join( - script.relative_to(package_dir.parent).with_suffix("").parts - ) - return [sys.executable, "-m", module] + base_args + user_args - return [str(script)] + base_args + user_args + for flag, (script, base_args) in legacy_map.items(): + if getattr(args, flag, False): + cmd = _build_cmd(script, base_args, passed_legacy_args) + sys.exit(subprocess.run(cmd).returncode) + + # Standard entry point: trigger pipeline execution if input directory is provided + if args.input: + script = package_dir / "pipeline.py" + cmd = _build_cmd(script, [], sys.argv[1:]) + + in_path = Path(args.input) + out_path = Path(args.output) if args.output else in_path.parent + out_path.mkdir(parents=True, exist_ok=True) + log_file_path = out_path / "krakenparser.log" - # Find which argument was given and run the corresponding script - for arg, (script, base_args) in command_map.items(): - if getattr(args, arg): - subprocess.run(_build_cmd(script, base_args, extra_args), check=True) - return + with open(log_file_path, "w") as log_file: + result = subprocess.run(cmd, stdout=log_file, stderr=subprocess.STDOUT) - # Default to full pipeline when -i/--input is given without a subcommand - if "-i" in extra_args or "--input" in extra_args: - complete_script, complete_base = command_map["complete"] - subprocess.run( - _build_cmd(complete_script, complete_base, extra_args), check=True - ) - return + if result.returncode == 0: + print("All steps completed successfully!") + print(f"Logs saved to {log_file_path}") + has_depth = any(arg in sys.argv for arg in ["-d", "--depth"]) + has_seed = any(arg in sys.argv for arg in ["-s", "--seed"]) + + out_str = out_path.as_posix() + + print("\n" + "=" * 95) + + if not has_depth and not has_seed: + print( + f""" +[INFO] Pipeline completed using default rarefaction parameters (depth=1000, seed=random). + To calibrate beta-diversity sensitivity metrics for this specific dataset, + manually execute the diversity sub-module with custom thresholds. 
+ Example: + krakenparser --step diversity \\ + -i {out_str}/counts/counts_species.csv \\ + -o {out_str}/diversity \\ + --depth 1500 \\ + --seed 42 + """.rstrip() + ) + + print( + f""" +[TIP] Downstream Data Visualization Prerequisite: + Relative abundance normalization is required to group low-abundance taxa + using the -O / --other parameter. Without filtering the 'long tail' + of rare taxa, the resulting visualization will suffer from overplotting + and significant loss of interpretability. + Example: + krakenparser --step relabund \\ + -i {out_str}/counts/counts_species.csv \\ + -o {out_str}/rel_abund/counts_species_relabund_3_5.csv \\ + -O 3.5 + +{"=" * 95} + """.rstrip() + ) + else: + print(f"Pipeline failed. Check logs at {log_file_path}") + + sys.exit(result.returncode) + + # Fallback to usage overview if no actionable arguments were provided + print("KrakenParser by Ilia V. Popov") parser.print_help() From bea9762929521a8198c0eed885a9f327f854e83f Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Sun, 31 May 2026 16:48:07 +0200 Subject: [PATCH 4/9] feat: add --viruses flag to full pipeline --- krakenparser/pipeline.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/krakenparser/pipeline.py b/krakenparser/pipeline.py index 4f51e10..5bafa51 100644 --- a/krakenparser/pipeline.py +++ b/krakenparser/pipeline.py @@ -41,6 +41,7 @@ def run_pipeline( input_dir: str, output_dir: str | None = None, keep_human: bool = False, + viruses_only: bool = False, rarefaction_depth: int = 1000, seed: int | None = None, overwrite: bool = False, @@ -88,7 +89,12 @@ def run_pipeline( _log.info("MPA files combined. 
Output: %s", combined_file) # Part 3: split combined MPA by rank - split_mpa(str(combined_file), str(intermediate_dir), keep_human=keep_human) + split_mpa( + str(combined_file), + str(intermediate_dir), + keep_human=keep_human, + viruses_only=viruses_only, + ) txt_dir = intermediate_dir / "txt" # Part 4: clean taxa names and add sample header @@ -143,6 +149,12 @@ def main() -> None: default=False, help="Do not filter human-related taxa (default: filtered)", ) + parser.add_argument( + "--viruses", + action="store_true", + default=False, + help="Extract only Viruses domain taxa in the pipeline", + ) parser.add_argument( "-d", "--depth", @@ -169,6 +181,7 @@ def main() -> None: args.input, args.output, keep_human=args.keep_human, + viruses_only=args.viruses, rarefaction_depth=args.depth, seed=args.seed, overwrite=args.overwrite, From e15d1efc076abd7cf61b32e494043072627b2c7b Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Sun, 31 May 2026 16:48:52 +0200 Subject: [PATCH 5/9] refactor(mpa): implement hierarchical sorting for combined MPA tables --- krakenparser/mpa/mpa_table.py | 71 ++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 18 deletions(-) diff --git a/krakenparser/mpa/mpa_table.py b/krakenparser/mpa/mpa_table.py index 2837972..a7cb717 100644 --- a/krakenparser/mpa/mpa_table.py +++ b/krakenparser/mpa/mpa_table.py @@ -12,9 +12,12 @@ def combine_mpa(in_files: list[str], o_file: str) -> None: out_path = ensure_output_dir(o_file, is_file=True) - # Plain dict preserves insertion order (Python 3.7+). 
- taxa: dict[str, dict[int, str]] = {} - sample_names: list[str] = [] + + samples: dict[int, str] = {} + values: dict[str, dict[int, str]] = {} + parent2child: dict[str, list[str]] = {} + toparse: list[str] = [] + sample_count = 0 _log.info("Number of files to parse: %d", len(in_files)) @@ -22,8 +25,10 @@ def combine_mpa(in_files: list[str], o_file: str) -> None: if not Path(in_path).is_file(): raise FileNotFoundError(f"Input file not found: {in_path}") - for idx, in_path in enumerate(in_files): - sample_name = f"Sample #{idx + 1}" + for in_path in in_files: + sample_count += 1 + sample_name = f"Sample #{sample_count}" + with open(in_path) as fh: for line in fh: line = line.rstrip("\n") @@ -34,26 +39,56 @@ def combine_mpa(in_files: list[str], o_file: str) -> None: if len(cols) >= 2: sample_name = cols[-1] continue + cols = line.split("\t", 1) if len(cols) < 2: continue - taxon, count = cols[0], cols[1] - if taxon not in taxa: - taxa[taxon] = {} - taxa[taxon][idx] = count - sample_names.append(sample_name) - - n_samples = len(sample_names) - n_taxa = len(taxa) + classification, val = cols[0], cols[1] + + split_vals = classification.split("|") + curr_parent = "" + for i in range(len(split_vals)): + test_val = "|".join(split_vals[:i]) # при i=0 → "" + if test_val in values: + curr_parent = test_val + + if curr_parent == "": + if classification not in toparse: + toparse.append(classification) + else: + if curr_parent not in parent2child: + parent2child[curr_parent] = [] + if classification not in parent2child[curr_parent]: + parent2child[curr_parent].append(classification) + + if classification not in values: + values[classification] = {} + values[classification][sample_count] = val + + samples[sample_count] = sample_name + + n_taxa = len(values) _log.info("Number of classifications to write: %d", n_taxa) + count_written = 0 with open(out_path, "w") as fh: - fh.write("#Classification\t" + "\t".join(sample_names) + "\n") - for taxon, counts in taxa.items(): - row = 
[counts.get(i, "0") for i in range(n_samples)] - fh.write(taxon + "\t" + "\t".join(row) + "\n") + header = "#Classification\t" + "\t".join( + samples[i] for i in range(1, sample_count + 1) + ) + fh.write(header + "\n") + + stack = list(toparse) + while stack: + curr = stack.pop(0) + if curr in parent2child: + stack = parent2child[curr] + stack + row = "\t".join( + values[curr].get(i, "0") for i in range(1, sample_count + 1) + ) + fh.write(curr + "\t" + row + "\n") + count_written += 1 - _log.info("%d classifications written", n_taxa) + _log.info("%d classifications written", count_written) def main() -> None: From e48266e848abfd9014441c39c9a9a58c1ba89331 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Sun, 31 May 2026 16:49:39 +0200 Subject: [PATCH 6/9] refactor(mpa): overhaul kreport parsing logic for robust hierarchy tracking --- krakenparser/mpa/transform2mpa.py | 142 +++++++++++++++++------------- 1 file changed, 82 insertions(+), 60 deletions(-) diff --git a/krakenparser/mpa/transform2mpa.py b/krakenparser/mpa/transform2mpa.py index 8d362f7..cf0bdae 100644 --- a/krakenparser/mpa/transform2mpa.py +++ b/krakenparser/mpa/transform2mpa.py @@ -9,51 +9,65 @@ from krakenparser.utils import ensure_output_dir -# Maps Kraken2 single-letter rank codes to MPA prefixes -_RANK_PREFIX = { - "D": "d", - "K": "k", - "P": "p", - "C": "c", - "O": "o", - "F": "f", - "G": "g", - "S": "s", -} - _log = logging.getLogger(__name__) +_MAIN_LVLS = {"R", "K", "D", "P", "C", "O", "F", "G", "S"} + -def _parse_line(line: str): +def _parse_line(line: str, remove_spaces: bool = False) -> list: """ Parse one Kraken2 report line. - Standard format (6 columns): - pct cum_reads direct_reads rank taxid name(indented) - - Returns (name, depth, rank, cum_reads, pct) or None on malformed input. + Returns [name, level_num, level_type, all_reads, percents] + or empty list on malformed input. 
""" parts = line.rstrip("\n").split("\t") - if len(parts) < 5: - return None + if len(parts) < 4: + return [] try: - pct = float(parts[0]) - cum_reads = int(parts[1]) + int(parts[1]) except ValueError: - return None + return [] - rank = parts[3].strip() - name_field = parts[-1] # always the last column regardless of format variant + try: + percents = float(parts[0]) + except ValueError: + return [] + all_reads = int(parts[1]) + + try: + int(parts[-3]) + level_type = parts[-2].strip() + map_kuniq = { + "species": "S", + "genus": "G", + "family": "F", + "order": "O", + "class": "C", + "phylum": "P", + "superkingdom": "D", + "kingdom": "K", + } + if level_type not in map_kuniq: + level_type = "-" + else: + level_type = map_kuniq[level_type] + except ValueError: + level_type = parts[-3].strip() - depth = 0 - for ch in name_field: + name = parts[-1] + spaces = 0 + for ch in name: if ch == " ": - depth += 1 + spaces += 1 else: break - name = name_field.strip() + name = name.strip() + if remove_spaces: + name = name.replace(" ", "_") - return name, depth // 2, rank, cum_reads, pct + level_num = spaces / 2 + return [name, level_num, level_type, all_reads, percents] def kreport_to_mpa( @@ -67,54 +81,63 @@ def kreport_to_mpa( """ Convert a single Kraken2 report to MPA format. - Uses a stack to track the current taxonomic path. Each entry is - (structural_depth, mpa_segment, is_standard_rank). When a node at - depth d is encountered, all stack entries with depth >= d are popped - before the new entry is pushed, keeping the path consistent. + Tracks the current taxonomic path via curr_path and prev_lvl_num, + popping the stack when moving back up the tree — exactly as the + original script does. 
""" if not Path(report_path).is_file(): raise FileNotFoundError(f"Input file not found: {report_path}") out_path = ensure_output_dir(output_path, is_file=True) - # Stack entries: (structural_depth, mpa_segment, is_standard_rank) - stack: list[tuple[int, str, bool]] = [] + + curr_path: list[str] = [] + prev_lvl_num = -1 with open(report_path) as r_fh, open(out_path, "w") as o_fh: if display_header: o_fh.write("#Classification\t" + os.path.basename(report_path) + "\n") for line in r_fh: - parsed = _parse_line(line) - if parsed is None: + report_vals = _parse_line(line, remove_spaces) + if len(report_vals) < 5: continue - name, depth, rank, cum_reads, pct = parsed - # Skip unclassified and root — never appear in MPA output - if rank in ("U", "R"): + name, level_num, level_type, all_reads, percents = report_vals + + # Пропускаем unclassified + if level_type == "U": continue - # Strip numeric suffix to get base rank (e.g. "S1" → "S", "G2" → "G") - rank_base = rank.rstrip("0123456789") - is_standard = rank_base in _RANK_PREFIX and rank == rank_base + # Нормализуем тип уровня + if level_type not in _MAIN_LVLS: + level_type = "x" + elif level_type == "K": + level_type = "k" + elif level_type == "D": + level_type = "d" - if not is_standard and not include_intermediate: - continue + level_str = level_type.lower() + "__" + name - prefix = _RANK_PREFIX.get(rank_base, "x") - seg_name = name.replace(" ", "_") if remove_spaces else name - mpa_seg = f"{prefix}__{seg_name}" + if prev_lvl_num == -1: + prev_lvl_num = level_num + curr_path.append(level_str) + continue - # Trim stack to the current structural depth - while stack and stack[-1][0] >= depth: - stack.pop() - stack.append((depth, mpa_seg, is_standard)) + while level_num != (prev_lvl_num + 1): + prev_lvl_num -= 1 + curr_path.pop() - # Build the full MPA path; omit intermediate (x__) segments when not requested - path = "|".join( - seg for (_, seg, std) in stack if include_intermediate or std - ) + if (level_type == "x" and 
include_intermediate) or level_type != "x": + ancestors = [ + seg + for seg in curr_path + if (seg[0] != "x" or include_intermediate) and seg[0] != "r" + ] + path = "|".join(ancestors + [level_str]) + value = str(all_reads) if use_reads else str(percents) + o_fh.write(path + "\t" + value + "\n") - value = str(cum_reads) if use_reads else str(pct) - o_fh.write(f"{path}\t{value}\n") + curr_path.append(level_str) + prev_lvl_num = level_num def main() -> None: @@ -137,7 +160,6 @@ def main() -> None: dest="input_dir", help="Input directory containing Kraken2 report files (batch mode)", ) - parser.add_argument( "-o", "--output", @@ -212,7 +234,7 @@ def main() -> None: continue out_name = f.name.replace(".kreport", ".MPA.TXT") kreport_to_mpa(str(f), str(output_dir / out_name), **kwargs) - _log.info(f"Converted to MPA successfully. Output stored in {output_dir}") + _log.info("Converted to MPA successfully. Output stored in %s", output_dir) else: kreport_to_mpa(args.r_file, args.o_file, **kwargs) From 06eedbbe8ce77af3b0351cd09a251636e9c88880 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Sun, 31 May 2026 16:50:19 +0200 Subject: [PATCH 7/9] test: update unit tests for _parse_line return type changes --- tests/test_units.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_units.py b/tests/test_units.py index 414d87d..598d307 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -43,15 +43,15 @@ def test_parse_line_intermediate_rank(): def test_parse_line_too_few_columns(): - assert _parse_line("50.00\t500000\n") is None + assert _parse_line("50.00\t500000\n") == [] def test_parse_line_non_numeric_pct(): - assert _parse_line("not_a_float\t500000\t0\tP\t1224\tBacteria\n") is None + assert _parse_line("not_a_float\t500000\t0\tP\t1224\tBacteria\n") == [] def test_parse_line_non_numeric_reads(): - assert _parse_line("50.00\tnot_int\t0\tP\t1224\tBacteria\n") is None + assert _parse_line("50.00\tnot_int\t0\tP\t1224\tBacteria\n") == [] # 
--------------------------------------------------------------------------- From 54bf8e7cdcc57ad0b276665d6270576f05c2f190 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Sun, 31 May 2026 16:53:52 +0200 Subject: [PATCH 8/9] refactor: improve CLI startup output visibility --- krakenparser/krakenparser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/krakenparser/krakenparser.py b/krakenparser/krakenparser.py index d094aa0..523a3f5 100755 --- a/krakenparser/krakenparser.py +++ b/krakenparser/krakenparser.py @@ -13,6 +13,7 @@ def main(): + print("KrakenParser by Ilia V. Popov") logging.basicConfig(level=logging.INFO, format="%(message)s") package_dir = Path(__file__).resolve().parent @@ -219,7 +220,6 @@ def _build_cmd( sys.exit(result.returncode) # Fallback to usage overview if no actionable arguments were provided - print("KrakenParser by Ilia V. Popov") parser.print_help() From 766ce5243e8ca851ff7cc74d66fbeffe004f3faa Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Sun, 31 May 2026 17:40:37 +0200 Subject: [PATCH 9/9] docs: update results directory structure in README --- README.md | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index ecc02f1..d96b894 100644 --- a/README.md +++ b/README.md @@ -305,16 +305,17 @@ results/ │ ├─ alpha_div.csv │ ├─ beta_div_bray.csv │ └─ beta_div_jaccard.csv -└─ intermediate/ # Intermediate files - ├─ mpa/ # Converted MPA files - │ ├─ {sample}.txt - │ ├─ ... - ├─ COMBINED.txt # Merged MPA table - └─ txt/ # Extracted taxonomic levels in TXT - ├─ counts_species.txt - ├─ counts_genus.txt - ├─ ... - └─ counts_phylum.txt +├─ intermediate/ # Intermediate files +│ ├─ mpa/ # Converted MPA files +│ │ ├─ {sample}.txt +│ │ ├─ ... +│ ├─ COMBINED.txt # Merged MPA table +│ └─ txt/ # Extracted taxonomic levels in TXT +│ ├─ counts_species.txt +│ ├─ counts_genus.txt +│ ├─ ... +│ └─ counts_phylum.txt +└─ krakenparser.log # Pipeline execution logs ``` ## Conclusion