From aa05ad05797e74ee285cdf12b831279ce2b11c64 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 12 May 2026 18:23:49 +0100 Subject: [PATCH] fix(bam): emit SAM-spec NM:i: tag distinct from STAR-internal nM:i: --outSAMattributes NM was being routed to the existing nM writer (substitutions only), so requests for the SAM-spec edit-distance tag silently produced wrong values. Downstream tools that read NM:i: (samtools stats, picard, MultiQC) saw nothing. Treat NM and nM as distinct attribute tokens. When NM is requested, compute SAM-spec edit distance per the SAM v1 spec section 1.4: substitutions + inserted bases + deleted bases (excluding intron N skips). Keep nM emission unchanged when explicitly requested. Fixes #29 --- src/io/sam.rs | 287 +++++++++++++++++++++++++++++++------------------- 1 file changed, 179 insertions(+), 108 deletions(-) diff --git a/src/io/sam.rs b/src/io/sam.rs index 9e5cff5..7049d4b 100644 --- a/src/io/sam.rs +++ b/src/io/sam.rs @@ -462,8 +462,16 @@ impl SamWriter { if attrs.contains("AS") { data.insert(Tag::ALIGNMENT_SCORE, Value::from(mapped_transcript.score)); } - if attrs.contains("NM") || attrs.contains("nM") { - // STAR maps NM attribute to 'nM' tag (mismatches only, not edit distance) + if attrs.contains("NM") { + data.insert( + Tag::EDIT_DISTANCE, + Value::from(sam_spec_nm( + mapped_transcript.n_mismatch, + &mapped_transcript.cigar, + )), + ); + } + if attrs.contains("nM") { data.insert( Tag::new(b'n', b'M'), Value::from(mapped_transcript.n_mismatch as i32), @@ -649,7 +657,13 @@ impl SamWriter { if attrs.contains("AS") { data.insert(Tag::ALIGNMENT_SCORE, Value::from(t.score)); } - if attrs.contains("NM") || attrs.contains("nM") { + if attrs.contains("NM") { + data.insert( + Tag::EDIT_DISTANCE, + Value::from(sam_spec_nm(t.n_mismatch, &t.cigar)), + ); + } + if attrs.contains("nM") { data.insert(Tag::new(b'n', b'M'), Value::from(t.n_mismatch as i32)); } @@ -947,8 +961,13 @@ fn transcript_to_record( if attrs.contains("AS") { data.insert(Tag::ALIGNMENT_SCORE, Value::from(transcript.score)); } - if attrs.contains("NM") || attrs.contains("nM") { - // STAR maps NM attribute to 'nM' tag (mismatches only, not edit distance) + if attrs.contains("NM") { + data.insert( + Tag::EDIT_DISTANCE, + Value::from(sam_spec_nm(transcript.n_mismatch, &transcript.cigar)), + ); + } + if attrs.contains("nM") { data.insert( Tag::new(b'n', b'M'), Value::from(transcript.n_mismatch as i32), @@ -977,19 +996,17 @@ fn transcript_to_record( Ok(record) } -/// Compute edit distance (mismatches + inserted + deleted bases). -/// Not emitted in SAM output (STAR maps NM attribute to 'nM' tag), but kept for tests. -#[cfg(test)] -fn compute_edit_distance(transcript: &Transcript) -> i32 { - let indel_bases: u32 = transcript - .cigar +/// SAM-spec `NM:i:` edit distance: substitutions + inserted bases + deleted bases. +/// Intron `N` skips and clips are excluded per SAM v1 §1.4. +fn sam_spec_nm(n_mismatch: u32, cigar: &[CigarOp]) -> i32 { + let indel_bases: u32 = cigar .iter() .filter_map(|op| match op { CigarOp::Ins(n) | CigarOp::Del(n) => Some(*n), _ => None, }) .sum(); - (transcript.n_mismatch + indel_bases) as i32 + (n_mismatch + indel_bases) as i32 } /// Derive XS strand tag from transcript junction motifs. @@ -1280,8 +1297,13 @@ fn build_paired_mate_record( // STAR reports combined score (sum of both mates) for PE AS tag data.insert(Tag::ALIGNMENT_SCORE, Value::from(combined_score)); } - if attrs.contains("NM") || attrs.contains("nM") { - // STAR maps NM attribute to 'nM' tag (mismatches only, not edit distance) + if attrs.contains("NM") { + data.insert( + Tag::EDIT_DISTANCE, + Value::from(sam_spec_nm(transcript.n_mismatch, &transcript.cigar)), + ); + } + if attrs.contains("nM") { data.insert( Tag::new(b'n', b'M'), Value::from(transcript.n_mismatch as i32), @@ -1829,106 +1851,63 @@ mod tests { Some(&Value::from(100_i32)), "AS tag should be 100" ); - // STAR maps NM attribute to 'nM' tag (mismatches only); no standard NM tag assert_eq!( data.get(&Tag::EDIT_DISTANCE), - None, - "Standard NM tag should not be present (STAR outputs nM not NM)" + Some(&Value::from(5_i32)), + "NM should be edit distance (2 mismatches + 3 deleted bases)" ); assert_eq!( data.get(&Tag::new(b'n', b'M')), Some(&Value::from(2_i32)), - "nM tag should be 2 (mismatches only, both NM and nM attrs map here)" + "nM should be mismatches only (2)" ); } #[test] fn test_edit_distance_computation() { // Pure match: NM = n_mismatch only - let t1 = Transcript { - chr_idx: 0, - genome_start: 0, - genome_end: 50, - is_reverse: false, - exons: vec![], - cigar: vec![CigarOp::Match(50)], - score: 100, - n_mismatch: 3, - n_gap: 0, - n_junction: 0, - junction_motifs: vec![], - junction_annotated: vec![], - read_seq: vec![], - }; - assert_eq!(compute_edit_distance(&t1), 3); + assert_eq!(sam_spec_nm(3, &[CigarOp::Match(50)]), 3); // Match + Ins + Del: NM = n_mismatch + ins_len + del_len - let t2 = Transcript { - chr_idx: 0, - genome_start: 0, - genome_end: 60, - is_reverse: false, - exons: vec![], - cigar: vec![ - CigarOp::Match(20), - CigarOp::Ins(5), - CigarOp::Match(10), - CigarOp::Del(7), - CigarOp::Match(20), - ], - score: 80, - n_mismatch: 1, - n_gap: 2, - n_junction: 0, - junction_motifs: vec![], - junction_annotated: vec![], - read_seq: vec![], - }; - assert_eq!(compute_edit_distance(&t2), 13); // 1 + 5 + 7 + assert_eq!( + sam_spec_nm( + 1, + &[ + CigarOp::Match(20), + CigarOp::Ins(5), + CigarOp::Match(10), + CigarOp::Del(7), + CigarOp::Match(20), + ], + ), + 13 + ); // RefSkip (splice junction) should NOT count toward NM - let t3 = Transcript { - chr_idx: 0, - genome_start: 0, - genome_end: 200, - is_reverse: false, - exons: vec![], - cigar: vec![ - CigarOp::Match(25), - CigarOp::RefSkip(1000), - CigarOp::Match(25), - ], - score: 50, - n_mismatch: 0, - n_gap: 0, - n_junction: 1, - junction_motifs: vec![], - junction_annotated: vec![], - read_seq: vec![], - }; - assert_eq!(compute_edit_distance(&t3), 0); + assert_eq!( + sam_spec_nm( + 0, + &[ + CigarOp::Match(25), + CigarOp::RefSkip(1000), + CigarOp::Match(25), + ], + ), + 0 + ); // Soft clips should NOT count toward NM - let t4 = Transcript { - chr_idx: 0, - genome_start: 0, - genome_end: 40, - is_reverse: false, - exons: vec![], - cigar: vec![ - CigarOp::SoftClip(10), - CigarOp::Match(40), - CigarOp::SoftClip(10), - ], - score: 40, - n_mismatch: 2, - n_gap: 0, - n_junction: 0, - junction_motifs: vec![], - junction_annotated: vec![], - read_seq: vec![], - }; - assert_eq!(compute_edit_distance(&t4), 2); + assert_eq!( + sam_spec_nm( + 2, + &[ + CigarOp::SoftClip(10), + CigarOp::Match(40), + CigarOp::SoftClip(10), + ], + ), + 2 + ); } #[test] @@ -1976,11 +1955,8 @@ mod tests { ); assert_eq!(data.get(&Tag::HIT_INDEX), Some(&Value::from(1_i32))); assert_eq!(data.get(&Tag::ALIGNMENT_SCORE), Some(&Value::from(100_i32))); - // Standard NM tag absent (STAR maps NM→nM) - assert_eq!(data.get(&Tag::EDIT_DISTANCE), None); - // nM = 2 mismatches (NM and nM attrs both map to nM tag) + assert_eq!(data.get(&Tag::EDIT_DISTANCE), Some(&Value::from(2_i32))); assert_eq!(data.get(&Tag::new(b'n', b'M')), Some(&Value::from(2_i32))); - // XS not in standard attrs assert_eq!(data.get(&Tag::new(b'X', b'S')), None); } @@ -3081,12 +3057,12 @@ mod tests { "AS should be present" ); assert!( - data.get(&Tag::EDIT_DISTANCE).is_none(), - "Standard NM tag should be absent (STAR maps NM→nM)" + data.get(&Tag::EDIT_DISTANCE).is_some(), + "NM should be present" ); assert!( data.get(&Tag::new(b'n', b'M')).is_some(), - "nM should be present (NM and nM attrs both map here)" + "nM should be present" ); // Non-standard tags absent assert!( @@ -3360,17 +3336,112 @@ mod tests { .unwrap(); let data = record.data(); - // Standard NM tag absent (STAR maps NM attribute to 'nM' tag) assert_eq!( data.get(&Tag::EDIT_DISTANCE), - None, - "Standard NM tag should be absent" + Some(&Value::from(10_i32)), + "NM should be edit distance: 2 mismatches + 5 ins + 3 del = 10" ); - // nM = mismatches only: 2 (both NM and nM attrs map here) assert_eq!( data.get(&Tag::new(b'n', b'M')), Some(&Value::from(2_i32)), - "nM should be 2 (mismatches only, NM+nM attrs both map to nM tag)" + "nM should be 2 (mismatches only)" + ); + } + + #[test] + fn test_nm_tag_distinct_from_nm_lower() { + let genome = make_test_genome(); + + let transcript = Transcript { + chr_idx: 0, + genome_start: 0, + genome_end: 103, + is_reverse: false, + exons: vec![], + cigar: vec![CigarOp::Match(80), CigarOp::Del(2), CigarOp::Match(21)], + score: 100, + n_mismatch: 1, + n_gap: 1, + n_junction: 0, + junction_motifs: vec![], + junction_annotated: vec![], + read_seq: vec![0, 1, 2, 3], + }; + + let read_seq = vec![0, 1, 2, 3]; + let read_qual = vec![30, 30, 30, 30]; + + let both: HashSet = ["NM", "nM"].iter().map(|s| s.to_string()).collect(); + let record = transcript_to_record( + &transcript, + "read1", + &read_seq, + &read_qual, + &genome, + 255, + 1, + 1, + &both, + ) + .unwrap(); + let data = record.data(); + assert_eq!( + data.get(&Tag::EDIT_DISTANCE), + Some(&Value::from(3_i32)), + "NM should be 1 mismatch + 2 deleted bases = 3" + ); + assert_eq!( + data.get(&Tag::new(b'n', b'M')), + Some(&Value::from(1_i32)), + "nM should be 1 (mismatches only)" + ); + + let only_lower: HashSet = ["nM"].iter().map(|s| s.to_string()).collect(); + let record = transcript_to_record( + &transcript, + "read1", + &read_seq, + &read_qual, + &genome, + 255, + 1, + 1, + &only_lower, + ) + .unwrap(); + let data = record.data(); + assert!( + data.get(&Tag::EDIT_DISTANCE).is_none(), + "NM should be absent when only nM is requested" + ); + assert_eq!( + data.get(&Tag::new(b'n', b'M')), + Some(&Value::from(1_i32)), + "nM should still be 1" + ); + + let only_upper: HashSet = ["NM"].iter().map(|s| s.to_string()).collect(); + let record = transcript_to_record( + &transcript, + "read1", + &read_seq, + &read_qual, + &genome, + 255, + 1, + 1, + &only_upper, + ) + .unwrap(); + let data = record.data(); + assert_eq!( + data.get(&Tag::EDIT_DISTANCE), + Some(&Value::from(3_i32)), + "NM should be present when only NM is requested" + ); + assert!( + data.get(&Tag::new(b'n', b'M')).is_none(), + "nM should be absent when only NM is requested" ); }