From ed59a6c3979e6bc6316400c049a03c134452f557 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 12 May 2026 18:04:12 +0100 Subject: [PATCH 1/2] feat(bam): populate @PG header with PN/VN/CL fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per SAM spec §1.3, the @PG line conventionally carries PN (program name), VN (version), and CL (command line) alongside ID. rustar was emitting only ID:rustar-aligner, leaving downstream provenance tools (MultiQC's program-version table, dx-toolkit lineage tracking) with a blank entry. Expand the header writer to emit: @PG\tID:rustar-aligner\tPN:rustar-aligner\tVN:<version>\tCL:<command line> The full command line is captured in main() before clap parses it, then threaded into Parameters via a new (skip) field so it reaches the SAM header builder. Version comes from CARGO_PKG_VERSION at compile time. This matches STAR's @PG format and gives downstream tools the provenance they need. Fixes #33 (the @PG header gap; AS divergence is a separate item). --- src/io/sam.rs | 78 +++++++++++++++++++++++++++++++++++++++++++++++++-- src/main.rs | 4 ++- src/params.rs | 7 +++++ 3 files changed, 85 insertions(+), 4 deletions(-) diff --git a/src/io/sam.rs b/src/io/sam.rs index 9e5cff5..fce9460 100644 --- a/src/io/sam.rs +++ b/src/io/sam.rs @@ -17,7 +17,7 @@ use noodles::sam::alignment::record_buf::data::field::value::Array; use noodles::sam::alignment::record_buf::{QualityScores, RecordBuf, Sequence}; use noodles::sam::header::record::value::{ Map, - map::{Program, ReadGroup, tag::Other as HeaderOtherTag}, + map::{Program, ReadGroup, program::tag as program_tag, tag::Other as HeaderOtherTag}, }; use std::collections::HashSet; use std::fmt::Write as FmtWrite; @@ -843,8 +843,23 @@ where builder = builder.add_read_group(id, map); } - // @PG line - builder = builder.add_program("rustar-aligner", Map::<Program>::default()); + // @PG line. 
Per SAM spec §1.3, populate PN/VN/CL alongside ID so downstream + // provenance tools (MultiQC program-version table, etc.) see a fully + // populated record matching STAR's @PG format. + let mut pg = Map::<Program>::default(); + pg.other_fields_mut() + .insert(program_tag::NAME, BString::from("rustar-aligner")); + pg.other_fields_mut().insert( + program_tag::VERSION, + BString::from(env!("CARGO_PKG_VERSION")), + ); + let cl = params + .command_line + .clone() + .unwrap_or_else(|| "rustar-aligner".to_string()); + pg.other_fields_mut() + .insert(program_tag::COMMAND_LINE, BString::from(cl)); + builder = builder.add_program("rustar-aligner", pg); Ok(builder.build()) } @@ -1373,6 +1388,63 @@ mod tests { assert_eq!(header.reference_sequences().len(), 1); } + #[test] + fn test_build_sam_header_pg_line_populated() { + // The @PG line must carry PN, VN and CL alongside ID per SAM spec §1.3, + // so downstream provenance tools (MultiQC etc.) get a non-blank entry. + let genome = make_test_genome(); + let mut params = Parameters::parse_from(vec!["rustar-aligner", "--readFilesIn", "test.fq"]); + params.command_line = + Some("rustar-aligner --readFilesIn test.fq --runThreadN 4".to_string()); + + let header = build_sam_header(&genome, &params).unwrap(); + let programs = header.programs().as_ref(); + let pg = programs + .get(&b"rustar-aligner"[..]) + .expect("@PG line with ID:rustar-aligner must be present"); + + let pn: &[u8] = pg + .other_fields() + .get(&program_tag::NAME) + .expect("PN field must be present") + .as_ref(); + assert_eq!(pn, b"rustar-aligner"); + + let vn: &[u8] = pg + .other_fields() + .get(&program_tag::VERSION) + .expect("VN field must be present") + .as_ref(); + assert_eq!(vn, env!("CARGO_PKG_VERSION").as_bytes()); + + let cl: &[u8] = pg + .other_fields() + .get(&program_tag::COMMAND_LINE) + .expect("CL field must be present") + .as_ref(); + assert!(!cl.is_empty(), "CL field must be non-empty"); + assert_eq!(cl, b"rustar-aligner --readFilesIn test.fq --runThreadN 4"); 
+ } + + #[test] + fn test_build_sam_header_pg_line_default_cl_when_unset() { + // When command_line is None (e.g. tests, library use), fall back to + // the program name so CL is still non-empty. + let genome = make_test_genome(); + let params = Parameters::parse_from(vec!["rustar-aligner", "--readFilesIn", "test.fq"]); + assert!(params.command_line.is_none()); + + let header = build_sam_header(&genome, &params).unwrap(); + let programs = header.programs().as_ref(); + let pg = programs.get(&b"rustar-aligner"[..]).unwrap(); + let cl: &[u8] = pg + .other_fields() + .get(&program_tag::COMMAND_LINE) + .expect("CL field must be present even when command_line is None") + .as_ref(); + assert!(!cl.is_empty()); + } + #[test] fn test_build_sam_header_with_rg() { let genome = make_test_genome(); diff --git a/src/main.rs b/src/main.rs index 8979239..4cb56fd 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,6 +8,8 @@ fn main() -> anyhow::Result<()> { cpu::check_cpu_compat()?; - let params = Parameters::parse(); + let command_line = std::env::args().collect::<Vec<String>>().join(" "); + let mut params = Parameters::parse(); + params.command_line = Some(command_line); rustar_aligner::run(&params) } diff --git a/src/params.rs b/src/params.rs index 9c124b8..c4b4346 100644 --- a/src/params.rs +++ b/src/params.rs @@ -701,6 +701,13 @@ pub struct Parameters { /// Chimeric output type #[arg(long = "chimOutType", num_args = 1..=2, default_values_t = vec!["Junctions".to_string()])] pub chim_out_type: Vec<String>, + + /// Full command line as invoked (captured in `main` before clap parsing). + /// Not a CLI argument; populated programmatically and embedded in the + /// BAM `@PG` `CL:` field for provenance. STAR captures the same string + /// in `P.commandLineFull`. 
+ #[arg(skip)] + pub command_line: Option<String>, } impl Parameters { From 0e49eddd69ca8ff95cc248cc9600c6b5ede1985a Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 12 May 2026 18:12:38 +0100 Subject: [PATCH 2/2] chore(bam): remove narrative comments --- src/io/sam.rs | 7 ------- src/params.rs | 5 +---- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/src/io/sam.rs b/src/io/sam.rs index fce9460..b288b4e 100644 --- a/src/io/sam.rs +++ b/src/io/sam.rs @@ -843,9 +843,6 @@ where builder = builder.add_read_group(id, map); } - // @PG line. Per SAM spec §1.3, populate PN/VN/CL alongside ID so downstream - // provenance tools (MultiQC program-version table, etc.) see a fully - // populated record matching STAR's @PG format. let mut pg = Map::<Program>::default(); pg.other_fields_mut() .insert(program_tag::NAME, BString::from("rustar-aligner")); @@ -1390,8 +1387,6 @@ mod tests { #[test] fn test_build_sam_header_pg_line_populated() { - // The @PG line must carry PN, VN and CL alongside ID per SAM spec §1.3, - // so downstream provenance tools (MultiQC etc.) get a non-blank entry. let genome = make_test_genome(); let mut params = Parameters::parse_from(vec!["rustar-aligner", "--readFilesIn", "test.fq"]); params.command_line = @@ -1428,8 +1423,6 @@ mod tests { #[test] fn test_build_sam_header_pg_line_default_cl_when_unset() { - // When command_line is None (e.g. tests, library use), fall back to - // the program name so CL is still non-empty. let genome = make_test_genome(); let params = Parameters::parse_from(vec!["rustar-aligner", "--readFilesIn", "test.fq"]); assert!(params.command_line.is_none()); diff --git a/src/params.rs b/src/params.rs index c4b4346..01c975f 100644 --- a/src/params.rs +++ b/src/params.rs @@ -702,10 +702,7 @@ pub struct Parameters { #[arg(long = "chimOutType", num_args = 1..=2, default_values_t = vec!["Junctions".to_string()])] pub chim_out_type: Vec<String>, - /// Full command line as invoked (captured in `main` before clap parsing). 
- /// Not a CLI argument; populated programmatically and embedded in the - /// BAM `@PG` `CL:` field for provenance. STAR captures the same string - /// in `P.commandLineFull`. + /// Full command line as invoked, embedded in the BAM `@PG` `CL:` field. #[arg(skip)] pub command_line: Option<String>, }