From ed39f9c0174e91acd1de7bd93fe7602585005aa3 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Fri, 15 May 2026 09:35:48 +0200 Subject: [PATCH 1/4] chore: unify cigar stringification --- src/align/read_align.rs | 6 ++---- src/align/transcript.rs | 30 ++++++++++++++++++------------ src/chimeric/output.rs | 39 +++------------------------------------ src/chimeric/segment.rs | 7 ++++++- src/io/sam.rs | 8 ++++---- src/lib.rs | 2 -- 6 files changed, 33 insertions(+), 59 deletions(-) diff --git a/src/align/read_align.rs b/src/align/read_align.rs index 91321ad..831e032 100644 --- a/src/align/read_align.rs +++ b/src/align/read_align.rs @@ -323,7 +323,6 @@ pub fn align_read( } else { "unknown" }; - let cigar_str: String = t.cigar.iter().map(|op| format!("{}", op)).collect(); eprintln!( " transcript[{}]: chr={}:{}-{} ({}) score={} mm={} junctions={} cigar={}", ti, @@ -334,7 +333,7 @@ pub fn align_read( t.score, t.n_mismatch, t.n_junction, - cigar_str + t.cigar_string() ); } } @@ -633,7 +632,6 @@ pub fn align_read( } else { "unknown" }; - let cigar_str: String = t.cigar.iter().map(|op| format!("{}", op)).collect(); eprintln!( " FINAL[{}]: chr={}:{}-{} ({}) score={} mm={} junctions={} cigar={}", i, @@ -644,7 +642,7 @@ pub fn align_read( t.score, t.n_mismatch, t.n_junction, - cigar_str + t.cigar_string() ); } } diff --git a/src/align/transcript.rs b/src/align/transcript.rs index efc8de7..3be2959 100644 --- a/src/align/transcript.rs +++ b/src/align/transcript.rs @@ -1,5 +1,5 @@ /// Transcript data structures for storing alignment results -use std::fmt; +use std::fmt::Write as _; /// A complete alignment of a read to the genome #[derive(Debug, Clone)] @@ -134,16 +134,18 @@ impl CigarOp { } } -impl fmt::Display for CigarOp { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}{}", self.len(), self.op_char()) - } +/// Convert CIGAR operations to CIGAR string +pub(crate) fn cigar_to_string(cigar: &[CigarOp]) -> String { + cigar.iter().fold(String::new(), |mut c, op| { + let _ = write!(c, "{}{}", op.len(), op.op_char()); // infallible + c + }) } impl Transcript { /// Format CIGAR string pub fn cigar_string(&self) -> String { - self.cigar.iter().map(|op| op.to_string()).collect() + cigar_to_string(&self.cigar) } /// Calculate number of matched bases (for filtering) @@ -201,12 +203,16 @@ mod tests { use super::*; #[test] - fn test_cigar_op_display() { - assert_eq!(CigarOp::Match(50).to_string(), "50M"); - assert_eq!(CigarOp::Ins(3).to_string(), "3I"); - assert_eq!(CigarOp::Del(2).to_string(), "2D"); - assert_eq!(CigarOp::RefSkip(1000).to_string(), "1000N"); - assert_eq!(CigarOp::SoftClip(5).to_string(), "5S"); + fn test_cigar_to_string() { + let cigar = vec![ + CigarOp::Match(50), + CigarOp::Ins(2), + CigarOp::Del(3), + CigarOp::RefSkip(1000), + CigarOp::SoftClip(5), + ]; + + assert_eq!(cigar_to_string(&cigar), "50M2I3D1000N5S"); } #[test] diff --git a/src/chimeric/output.rs b/src/chimeric/output.rs index dade145..422f0d2 100644 --- a/src/chimeric/output.rs +++ b/src/chimeric/output.rs @@ -78,8 +78,8 @@ impl ChimericJunctionWriter { let acceptor_start = alignment.acceptor.genome_start + 1; // Convert CIGAR to string - let donor_cigar = cigar_to_string(&alignment.donor.cigar); - let acceptor_cigar = cigar_to_string(&alignment.acceptor.cigar); + let donor_cigar = alignment.donor.cigar_string(); + let acceptor_cigar = alignment.acceptor.cigar_string(); // Write line writeln!( @@ -162,7 +162,7 @@ fn format_sa_entry( let chr_start = chr_starts[seg.chr_idx]; let pos = seg.genome_start - chr_start + 1; // 1-based per-chr let strand = if seg.is_reverse { '-' } else { '+' }; - let cigar = cigar_to_string(&seg.cigar); + let cigar = seg.cigar_string(); format!( "{},{},{},{},{},{};", chr, pos, strand, cigar, mapq, seg.n_mismatch @@ -233,26 +233,6 @@ fn build_segment_record( Ok(record) } -/// Convert CIGAR operations to CIGAR string -fn cigar_to_string(cigar: &[crate::align::transcript::CigarOp]) -> String { - use crate::align::transcript::CigarOp; - - let mut result = String::new(); - for op in cigar { - match op { - CigarOp::Match(len) => result.push_str(&format!("{}M", len)), - CigarOp::Equal(len) => result.push_str(&format!("{}=", len)), - CigarOp::Diff(len) => result.push_str(&format!("{}X", len)), - CigarOp::Ins(len) => result.push_str(&format!("{}I", len)), - CigarOp::Del(len) => result.push_str(&format!("{}D", len)), - CigarOp::RefSkip(len) => result.push_str(&format!("{}N", len)), - CigarOp::SoftClip(len) => result.push_str(&format!("{}S", len)), - CigarOp::HardClip(len) => result.push_str(&format!("{}H", len)), - } - } - result -} - #[cfg(test)] mod tests { use super::*; @@ -261,19 +241,6 @@ mod tests { use std::io::Read; use tempfile::tempdir; - #[test] - fn test_cigar_to_string() { - let cigar = vec![ - CigarOp::Match(50), - CigarOp::Ins(2), - CigarOp::Del(3), - CigarOp::RefSkip(1000), - CigarOp::SoftClip(5), - ]; - - assert_eq!(cigar_to_string(&cigar), "50M2I3D1000N5S"); - } - #[test] fn test_chimeric_junction_writer_creation() { let dir = tempdir().unwrap(); diff --git a/src/chimeric/segment.rs b/src/chimeric/segment.rs index 1f48e75..9a5d0a9 100644 --- a/src/chimeric/segment.rs +++ b/src/chimeric/segment.rs @@ -1,6 +1,6 @@ // ChimericSegment and ChimericAlignment data structures -use crate::align::transcript::CigarOp; +use crate::align::transcript::{CigarOp, cigar_to_string}; /// A single segment of a chimeric alignment #[derive(Debug, Clone)] @@ -31,6 +31,11 @@ impl ChimericSegment { pub fn meets_min_length(&self, min_len: u32) -> bool { self.read_length() >= min_len as usize } + + /// Format CIGAR string + pub fn cigar_string(&self) -> String { + cigar_to_string(&self.cigar) + } } /// A chimeric alignment consisting of two segments diff --git a/src/io/sam.rs b/src/io/sam.rs index ee91a43..66a23a8 100644 --- a/src/io/sam.rs +++ b/src/io/sam.rs @@ -149,10 +149,10 @@ impl SamWriter { .name() .map(|n| String::from_utf8_lossy(n.as_ref()).to_string()) .unwrap_or_default(); - let cigar_str: String = cigar_ops - .iter() - .map(|op| format!("{}{:?}", op.len(), op.kind())) - .collect::(); + let cigar_str = cigar_ops.iter().fold(String::new(), |mut c, op| { + let _ = write!(c, "{}{:?}", op.len(), op.kind()); // infallible + c + }); panic!( "[SAM-MISMATCH] read={} cigar_query_len={} seq_len={} flags={:?} cigar={}", name, diff --git a/src/lib.rs b/src/lib.rs index 09472fa..26e788e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,8 +7,6 @@ clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::doc_markdown, - clippy::format_collect, - clippy::format_push_string, clippy::items_after_statements, clippy::match_same_arms, clippy::missing_errors_doc, From 323ea32eef15230b9d6002a755c7dd25b1bbcf0e Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Sat, 16 May 2026 01:03:12 +0200 Subject: [PATCH 2/4] switch to upstream --- src/align/mod.rs | 2 +- src/align/read_align.rs | 81 +++++----- src/align/stitch.rs | 135 ++++++++-------- src/align/transcript.rs | 205 +++++++++--------------- src/chimeric/detect.rs | 17 +- src/chimeric/output.rs | 35 +++-- src/chimeric/segment.rs | 10 +- src/io/bam.rs | 7 +- src/io/sam.rs | 315 +++++++++++++++++++------------------ src/lib.rs | 56 +++---- src/mapq.rs | 3 +- src/quant/transcriptome.rs | 156 +++++++++++------- src/stats.rs | 43 ++--- 13 files changed, 542 insertions(+), 523 deletions(-) diff --git a/src/align/mod.rs b/src/align/mod.rs index 34c9cd5..1679ced 100644 --- a/src/align/mod.rs +++ b/src/align/mod.rs @@ -10,4 +10,4 @@ pub use read_align::{ }; pub use seed::Seed; pub use stitch::{SeedCluster, WindowAlignment, stitch_seeds}; -pub use transcript::{CigarOp, Exon, Transcript}; +pub use transcript::{Exon, Transcript}; diff --git a/src/align/read_align.rs b/src/align/read_align.rs index 831e032..c824b82 100644 --- a/src/align/read_align.rs +++ b/src/align/read_align.rs @@ -355,14 +355,14 @@ pub fn align_read( a.genome_start, a.genome_end, a.is_reverse, - &a.cigar, + //&a.cigar, ) .cmp(&( b.chr_idx, b.genome_start, b.genome_end, b.is_reverse, - &b.cigar, + //&b.cigar, )) .then_with(|| b.score.cmp(&a.score)) }); @@ -466,13 +466,13 @@ pub fn align_read( // Absolute matched bases let n_matched = t.n_matched(); - if n_matched < params.out_filter_match_nmin { + if n_matched < params.out_filter_match_nmin as usize { *filter_reasons.entry("match_min").or_insert(0) += 1; return false; } // Relative matched bases: STAR casts to uint (u32) - if n_matched < (params.out_filter_match_nmin_over_lread * lread_m1) as u32 { + if (n_matched as f64) < params.out_filter_match_nmin_over_lread * lread_m1 { *filter_reasons.entry("match_min_relative").or_insert(0) += 1; return false; } @@ -518,7 +518,6 @@ pub fn align_read( match motif.implied_strand() { Some('+') => has_plus = true, Some('-') => has_minus = true, - None => {} _ => {} } } @@ -985,10 +984,9 @@ pub fn align_paired_read( if pos_cmp != std::cmp::Ordering::Equal { return pos_cmp; } - b.combined_wt_score - .cmp(&a.combined_wt_score) - .then_with(|| a.mate1_transcript.cigar.cmp(&b.mate1_transcript.cigar)) - .then_with(|| a.mate2_transcript.cigar.cmp(&b.mate2_transcript.cigar)) + b.combined_wt_score.cmp(&a.combined_wt_score) + //.then_with(|| a.mate1_transcript.cigar.cmp(&b.mate1_transcript.cigar)) + //.then_with(|| a.mate2_transcript.cigar.cmp(&b.mate2_transcript.cigar)) }); joint_pairs.dedup_by(|a, b| { a.mate1_transcript.chr_idx == b.mate1_transcript.chr_idx @@ -1414,18 +1412,18 @@ fn extract_junctions_from_cigar(t: &Transcript) -> Vec<(u64, u64)> { let mut junctions = Vec::new(); let mut genome_pos = t.genome_start; for op in &t.cigar { - use crate::align::transcript::CigarOp; - match op { - CigarOp::Match(n) | CigarOp::Equal(n) | CigarOp::Diff(n) | CigarOp::Del(n) => { - genome_pos += *n as u64; + use noodles::sam::alignment::record::cigar::op::Kind; + match op.kind() { + Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch | Kind::Deletion => { + genome_pos += op.len() as u64; } - CigarOp::RefSkip(n) => { + Kind::Skip => { let donor = genome_pos; - let acceptor = genome_pos + *n as u64; + let acceptor = genome_pos + op.len() as u64; junctions.push((donor, acceptor)); genome_pos = acceptor; } - CigarOp::Ins(_) | CigarOp::SoftClip(_) | CigarOp::HardClip(_) => {} + Kind::Insertion | Kind::SoftClip | Kind::HardClip | Kind::Pad => {} } } junctions @@ -1477,6 +1475,7 @@ mod tests { use crate::index::sa_index::SaIndex; use crate::index::suffix_array::SuffixArray; use clap::Parser; + use noodles::sam::alignment::record::cigar; fn make_test_params() -> Parameters { // Parse empty args to get default parameters @@ -1535,8 +1534,7 @@ mod tests { #[test] fn combined_transcript_for_projection_rewrites_mate2_ifrag() { - use crate::align::transcript::CigarOp; - + use cigar::op::{Kind, Op}; let make_tr = |gs: u64, ge: u64, rs: usize, re: usize| Transcript { chr_idx: 0, genome_start: gs, @@ -1549,7 +1547,7 @@ mod tests { read_end: re, i_frag: 0, }], - cigar: vec![CigarOp::Match((ge - gs) as u32)], + cigar: vec![Op::new(Kind::Match, (ge - gs) as usize)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1652,7 +1650,8 @@ mod tests { #[test] fn test_check_proper_pair_distance() { - use crate::align::transcript::{CigarOp, Exon}; + use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let params = make_test_params(); @@ -1669,7 +1668,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1691,7 +1690,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1707,7 +1706,8 @@ mod tests { #[test] fn test_check_proper_pair_too_far() { - use crate::align::transcript::{CigarOp, Exon}; + use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let mut params = make_test_params(); params.align_mates_gap_max = 100; @@ -1724,7 +1724,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1746,7 +1746,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1762,7 +1762,8 @@ mod tests { #[test] fn test_calculate_insert_size_positive() { - use crate::align::transcript::{CigarOp, Exon}; + use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; // Mate1 is leftmost let t1 = Transcript { @@ -1777,7 +1778,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1799,7 +1800,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1815,8 +1816,9 @@ mod tests { #[test] fn test_strand_consistency_filter() { - use crate::align::transcript::{CigarOp, Exon, Transcript}; + use crate::align::transcript::{Exon, Transcript}; use crate::params::IntronStrandFilter; + use cigar::op::{Kind, Op}; // Create a transcript with conflicting strand motifs (mixed + and - within one transcript) let t_inconsistent = Transcript { @@ -1831,7 +1833,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1854,7 +1856,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1915,7 +1917,8 @@ mod tests { #[test] fn test_calculate_insert_size_negative() { - use crate::align::transcript::{CigarOp, Exon}; + use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; // RF pair: mate1 reverse (right side), mate2 forward (left side). // STAR reverse cluster: tlen = mate1.genome_end - mate2.genome_start = 1300 - 1000 = 300. @@ -1932,7 +1935,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1954,7 +1957,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1971,7 +1974,8 @@ mod tests { #[test] fn test_noncanonical_unannotated_filter() { use crate::align::score::SpliceMotif; - use crate::align::transcript::{CigarOp, Exon, Transcript}; + use crate::align::transcript::{Exon, Transcript}; + use cigar::op::{Kind, Op}; // Helper: check if a transcript would be filtered by RemoveNoncanonicalUnannotated // (mirrors the logic in the retain closure) @@ -1994,7 +1998,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -2065,7 +2069,8 @@ mod tests { #[test] fn test_paired_alignment_result_enum_variants() { - use crate::align::transcript::{CigarOp, Exon}; + use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, @@ -2079,7 +2084,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, diff --git a/src/align/stitch.rs b/src/align/stitch.rs index 4d4e81b..c504964 100644 --- a/src/align/stitch.rs +++ b/src/align/stitch.rs @@ -1,9 +1,10 @@ -/// Seed clustering and stitching via dynamic programming +//! Seed clustering and stitching via dynamic programming use crate::align::score::AlignmentScorer; use crate::align::seed::Seed; -use crate::align::transcript::{CigarOp, Transcript}; +use crate::align::transcript::Transcript; use crate::error::Error; use crate::index::GenomeIndex; +use noodles::sam::alignment::record::cigar; /// STAR's MARK_FRAG_SPACER_BASE (IncludeDefine.h:174). /// Separates mate1 and mate2 fragments in the combined PE read. @@ -104,12 +105,13 @@ fn score_region( fn count_mismatches( read_seq: &[u8], - cigar_ops: &[CigarOp], + cigar_ops: &[cigar::Op], genome_start: u64, read_start: usize, index: &GenomeIndex, is_reverse: bool, ) -> u32 { + use cigar::op::Kind; // Add n_genome offset for reverse-strand genome access let genome_offset = if is_reverse { index.genome.n_genome } else { 0 }; @@ -118,9 +120,9 @@ fn count_mismatches( let mut genome_pos = genome_start; for op in cigar_ops { - match op { - CigarOp::Match(len) | CigarOp::Equal(len) | CigarOp::Diff(len) => { - for _i in 0..*len { + match op.kind() { + Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => { + for _i in 0..op.len() { if read_pos < read_seq.len() { let read_base = read_seq[read_pos]; if let Some(genome_base) = index.genome.get_base(genome_pos + genome_offset) @@ -135,16 +137,13 @@ fn count_mismatches( genome_pos += 1; } } - CigarOp::Ins(len) => { - read_pos += *len as usize; + Kind::Insertion | Kind::SoftClip => { + read_pos += op.len(); } - CigarOp::Del(len) | CigarOp::RefSkip(len) => { - genome_pos += *len as u64; + Kind::Deletion | Kind::Skip => { + genome_pos += op.len() as u64; } - CigarOp::SoftClip(len) => { - read_pos += *len as usize; - } - CigarOp::HardClip(_) => {} + Kind::HardClip | Kind::Pad => {} } } @@ -1753,18 +1752,20 @@ pub(crate) fn finalize_transcript( } } + use cigar::op::{Kind, Op}; + // Build final CIGAR from exon blocks - let mut final_cigar: Vec = Vec::new(); + let mut final_cigar: Vec = Vec::new(); // Left soft clip let remaining_left_clip = alignment_start - left_extend.extend_len; if remaining_left_clip > 0 { - final_cigar.push(CigarOp::SoftClip(remaining_left_clip as u32)); + final_cigar.push(Op::new(Kind::SoftClip, remaining_left_clip)); } // Left extension match if left_extend.extend_len > 0 { - final_cigar.push(CigarOp::Match(left_extend.extend_len as u32)); + final_cigar.push(Op::new(Kind::Match, left_extend.extend_len)); } // Walk exon blocks to build CIGAR @@ -1776,69 +1777,83 @@ pub(crate) fn finalize_transcript( if genome_gap > read_gap && genome_gap > 0 { // Shared match bases before the gap - let shared = read_gap.max(0) as u32; + let shared = read_gap.max(0) as usize; if shared > 0 { - if let Some(CigarOp::Match(prev_len)) = final_cigar.last_mut() { - *prev_len += shared; + // TODO: replace with “extend_or_push_match” function call + if let Some(op) = final_cigar.last_mut() + && op.kind() == Kind::Match + { + *op = Op::new(Kind::Match, op.len() + shared); } else { - final_cigar.push(CigarOp::Match(shared)); + final_cigar.push(Op::new(Kind::Match, shared)); } } - let del = (genome_gap - read_gap.max(0)) as u32; - if del >= scorer.align_intron_min && del <= scorer.align_intron_max { - final_cigar.push(CigarOp::RefSkip(del)); + let del = (genome_gap - read_gap.max(0)) as usize; + if del >= scorer.align_intron_min as usize + && del <= scorer.align_intron_max as usize + { + final_cigar.push(Op::new(Kind::Skip, del)); } else { - final_cigar.push(CigarOp::Del(del)); + final_cigar.push(Op::new(Kind::Deletion, del)); } } else if read_gap > genome_gap && read_gap > 0 { // Insertion - let shared = genome_gap.max(0) as u32; + let shared = genome_gap.max(0) as usize; if shared > 0 { - if let Some(CigarOp::Match(prev_len)) = final_cigar.last_mut() { - *prev_len += shared; + // TODO: replace with “extend_or_push_match” function call + if let Some(op) = final_cigar.last_mut() + && op.kind() == Kind::Match + { + *op = Op::new(Kind::Match, op.len() + shared); } else { - final_cigar.push(CigarOp::Match(shared)); + final_cigar.push(Op::new(Kind::Match, shared)); } } - let ins = (read_gap - genome_gap.max(0)) as u32; - final_cigar.push(CigarOp::Ins(ins)); + let ins = (read_gap - genome_gap.max(0)) as usize; + final_cigar.push(Op::new(Kind::Insertion, ins)); } // Equal gap case is handled by extended exon blocks in stitch_align_to_transcript } // This exon's match region - let match_len = (exon.read_end - exon.read_start) as u32; + let match_len = exon.read_end - exon.read_start; if match_len > 0 { - if let Some(CigarOp::Match(prev_len)) = final_cigar.last_mut() { - *prev_len += match_len; + // TODO: replace with “extend_or_push_match” function call + if let Some(op) = final_cigar.last_mut() + && op.kind() == Kind::Match + { + *op = Op::new(Kind::Match, op.len() + match_len); } else { - final_cigar.push(CigarOp::Match(match_len)); + final_cigar.push(Op::new(Kind::Match, match_len)); } } } // Right extension match if right_extend.extend_len > 0 { - if let Some(CigarOp::Match(prev_len)) = final_cigar.last_mut() { - *prev_len += right_extend.extend_len as u32; + // TODO: replace with “extend_or_push_match” function call + if let Some(op) = final_cigar.last_mut() + && op.kind() == Kind::Match + { + *op = Op::new(Kind::Match, op.len() + right_extend.extend_len); } else { - final_cigar.push(CigarOp::Match(right_extend.extend_len as u32)); + final_cigar.push(Op::new(Kind::Match, right_extend.extend_len)); } } // Right soft clip let remaining_right_clip = (read_seq.len() - alignment_end) - right_extend.extend_len; if remaining_right_clip > 0 { - final_cigar.push(CigarOp::SoftClip(remaining_right_clip as u32)); + final_cigar.push(Op::new(Kind::SoftClip, remaining_right_clip)); } // Validate CIGAR read-consuming length - let cigar_read_len: u32 = final_cigar + let cigar_read_len: usize = final_cigar .iter() - .filter(|op| op.consumes_query()) + .filter(|op| op.kind().consumes_read()) .map(|op| op.len()) .sum(); - if cigar_read_len != read_seq.len() as u32 { + if cigar_read_len != read_seq.len() { // Invalid CIGAR: exon block geometry is inconsistent with read length. // This can occur when a combined PE read's WorkingTranscript has exon // positions that span both mates or include the spacer region. Silently @@ -1870,13 +1885,13 @@ pub(crate) fn finalize_transcript( // Compute total reference-consuming length from CIGAR let mut ref_len = 0u64; for op in &final_cigar { - match op { - CigarOp::Match(len) - | CigarOp::Equal(len) - | CigarOp::Diff(len) - | CigarOp::Del(len) - | CigarOp::RefSkip(len) => { - ref_len += *len as u64; + match op.kind() { + Kind::Match + | Kind::SequenceMatch + | Kind::SequenceMismatch + | Kind::Deletion + | Kind::Skip => { + ref_len += op.len() as u64; } _ => {} } @@ -1898,9 +1913,9 @@ pub(crate) fn finalize_transcript( let mut genome_pos_e = forward_genome_start; for op in &final_cigar { - match op { - CigarOp::Match(len) | CigarOp::Equal(len) | CigarOp::Diff(len) => { - let len = *len as usize; + match op.kind() { + Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => { + let len = op.len(); exons.push(Exon { genome_start: genome_pos_e, genome_end: genome_pos_e + len as u64, @@ -1913,19 +1928,13 @@ pub(crate) fn finalize_transcript( read_pos_e += len; genome_pos_e += len as u64; } - CigarOp::Ins(len) => { - read_pos_e += *len as usize; - } - CigarOp::Del(len) => { - genome_pos_e += *len as u64; - } - CigarOp::RefSkip(len) => { - genome_pos_e += *len as u64; + Kind::Insertion | Kind::SoftClip => { + read_pos_e += op.len(); } - CigarOp::SoftClip(len) => { - read_pos_e += *len as usize; + Kind::Deletion | Kind::Skip => { + genome_pos_e += op.len() as u64; } - CigarOp::HardClip(_) => {} + Kind::HardClip | Kind::Pad => {} } } diff --git a/src/align/transcript.rs b/src/align/transcript.rs index 3be2959..ea53c3e 100644 --- a/src/align/transcript.rs +++ b/src/align/transcript.rs @@ -1,4 +1,5 @@ -/// Transcript data structures for storing alignment results +//! Transcript data structures for storing alignment results +use noodles::sam::alignment::record::cigar; use std::fmt::Write as _; /// A complete alignment of a read to the genome @@ -15,7 +16,7 @@ pub struct Transcript { /// Exon segments pub exons: Vec, /// CIGAR operations - pub cigar: Vec, + pub cigar: Vec, /// Alignment score pub score: i32, /// Number of mismatches @@ -54,90 +55,25 @@ pub struct Exon { pub i_frag: u8, } -/// CIGAR operation -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -pub enum CigarOp { - /// M: match/mismatch (default mode) - Match(u32), - /// =: exact match (optional) - Equal(u32), - /// X: mismatch (optional) - Diff(u32), - /// I: insertion to reference - Ins(u32), - /// D: deletion from reference - Del(u32), - /// N: splice junction (skipped reference region) - RefSkip(u32), - /// S: soft clip (clipped sequence present in read) - SoftClip(u32), - /// H: hard clip (clipped sequence not present) - HardClip(u32), -} - -impl CigarOp { - /// Get the operation character - pub fn op_char(&self) -> char { - match self { - CigarOp::Match(_) => 'M', - CigarOp::Equal(_) => '=', - CigarOp::Diff(_) => 'X', - CigarOp::Ins(_) => 'I', - CigarOp::Del(_) => 'D', - CigarOp::RefSkip(_) => 'N', - CigarOp::SoftClip(_) => 'S', - CigarOp::HardClip(_) => 'H', - } - } - - /// Get the operation length - pub fn len(&self) -> u32 { - match self { - CigarOp::Match(n) - | CigarOp::Equal(n) - | CigarOp::Diff(n) - | CigarOp::Ins(n) - | CigarOp::Del(n) - | CigarOp::RefSkip(n) - | CigarOp::SoftClip(n) - | CigarOp::HardClip(n) => *n, - } - } - - /// Check if operation is empty - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Check if operation consumes query bases - pub fn consumes_query(&self) -> bool { - matches!( - self, - CigarOp::Match(_) - | CigarOp::Equal(_) - | CigarOp::Diff(_) - | CigarOp::Ins(_) - | CigarOp::SoftClip(_) - ) - } - - /// Check if operation consumes reference bases - pub fn consumes_reference(&self) -> bool { - matches!( - self, - CigarOp::Match(_) - | CigarOp::Equal(_) - | CigarOp::Diff(_) - | CigarOp::Del(_) - | CigarOp::RefSkip(_) - ) +fn cigar_char(kind: cigar::op::Kind) -> char { + use cigar::op::Kind; + match kind { + Kind::Match => 'M', + Kind::Insertion => 'I', + Kind::Deletion => 'D', + Kind::Skip => 'N', + Kind::SoftClip => 'S', + Kind::HardClip => 'H', + Kind::Pad => 'P', + Kind::SequenceMatch => '=', + Kind::SequenceMismatch => 'X', } } /// Convert CIGAR operations to CIGAR string -pub(crate) fn cigar_to_string(cigar: &[CigarOp]) -> String { +pub(crate) fn cigar_to_string(cigar: &[cigar::Op]) -> String { cigar.iter().fold(String::new(), |mut c, op| { - let _ = write!(c, "{}{}", op.len(), op.op_char()); // infallible + let _ = write!(c, "{}{}", op.len(), cigar_char(op.kind())); // infallible c }) } @@ -149,52 +85,43 @@ impl Transcript { } /// Calculate number of matched bases (for filtering) - pub fn n_matched(&self) -> u32 { + pub fn n_matched(&self) -> usize { + use cigar::op::Kind; self.cigar .iter() - .filter_map(|op| match op { - CigarOp::Match(n) | CigarOp::Equal(n) => Some(*n), + .filter_map(|op| match op.kind() { + Kind::Match | Kind::SequenceMatch => Some(op.len()), _ => None, }) .sum() } /// Calculate read length from CIGAR - pub fn read_length(&self) -> u32 { + pub fn read_length(&self) -> usize { self.cigar .iter() - .filter(|op| op.consumes_query()) + .filter(|op| op.kind().consumes_read()) .map(|op| op.len()) .sum() } /// Calculate reference length from CIGAR - pub fn reference_length(&self) -> u32 { + pub fn reference_length(&self) -> usize { self.cigar .iter() - .filter(|op| op.consumes_reference()) + .filter(|op| op.kind().consumes_reference()) .map(|op| op.len()) .sum() } /// Count soft-clipped bases on left and right ends /// - /// Returns (left_clip, right_clip) in bases - pub fn count_soft_clips(&self) -> (u32, u32) { - let mut left_clip = 0u32; - let mut right_clip = 0u32; - - // Check first operation for left clip - if let Some(CigarOp::SoftClip(n)) = self.cigar.first() { - left_clip = *n; - } - - // Check last operation for right clip - if let Some(CigarOp::SoftClip(n)) = self.cigar.last() { - right_clip = *n; - } - - (left_clip, right_clip) + /// Returns `[left_clip, right_clip]` in bases + pub fn count_soft_clips(&self) -> [usize; 2] { + [self.cigar.first(), self.cigar.last()].map(|c| { + c.filter(|c| c.kind() == cigar::op::Kind::SoftClip) + .map_or(0, |c| c.len()) + }) } } @@ -204,12 +131,13 @@ mod tests { #[test] fn test_cigar_to_string() { + use cigar::op::{Kind, Op}; let cigar = vec![ - CigarOp::Match(50), - CigarOp::Ins(2), - CigarOp::Del(3), - CigarOp::RefSkip(1000), - CigarOp::SoftClip(5), + Op::new(Kind::Match, 50), + Op::new(Kind::Insertion, 2), + Op::new(Kind::Deletion, 3), + Op::new(Kind::Skip, 1000), + Op::new(Kind::SoftClip, 5), ]; assert_eq!(cigar_to_string(&cigar), "50M2I3D1000N5S"); @@ -217,6 +145,7 @@ mod tests { #[test] fn test_cigar_string() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 100, @@ -224,9 +153,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(50), - CigarOp::RefSkip(100), - CigarOp::Match(50), + Op::new(Kind::Match, 50), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 50), ], score: 100, n_mismatch: 0, @@ -242,24 +171,27 @@ mod tests { #[test] fn test_cigar_consumes() { - assert!(CigarOp::Match(10).consumes_query()); - assert!(CigarOp::Match(10).consumes_reference()); + use cigar::op::Kind; + + assert!(Kind::Match.consumes_read()); + assert!(Kind::Match.consumes_reference()); - assert!(CigarOp::Ins(5).consumes_query()); - assert!(!CigarOp::Ins(5).consumes_reference()); + assert!(Kind::Insertion.consumes_read()); + assert!(!Kind::Insertion.consumes_reference()); - assert!(!CigarOp::Del(3).consumes_query()); - assert!(CigarOp::Del(3).consumes_reference()); + assert!(!Kind::Deletion.consumes_read()); + assert!(Kind::Deletion.consumes_reference()); - assert!(!CigarOp::RefSkip(1000).consumes_query()); - assert!(CigarOp::RefSkip(1000).consumes_reference()); + assert!(!Kind::Skip.consumes_read()); + assert!(Kind::Skip.consumes_reference()); - assert!(CigarOp::SoftClip(5).consumes_query()); - assert!(!CigarOp::SoftClip(5).consumes_reference()); + assert!(Kind::SoftClip.consumes_read()); + assert!(!Kind::SoftClip.consumes_reference()); } #[test] fn test_transcript_lengths() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 100, @@ -267,11 +199,11 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(45), - CigarOp::Ins(3), - CigarOp::Match(2), - CigarOp::RefSkip(100), - CigarOp::Match(50), + Op::new(Kind::Match, 45), + Op::new(Kind::Insertion, 3), + Op::new(Kind::Match, 2), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 50), ], score: 100, n_mismatch: 0, @@ -310,6 +242,7 @@ mod tests { #[test] fn test_count_soft_clips_both_ends() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 100, @@ -317,9 +250,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::SoftClip(10), - CigarOp::Match(50), - CigarOp::SoftClip(15), + Op::new(Kind::SoftClip, 10), + Op::new(Kind::Match, 50), + Op::new(Kind::SoftClip, 15), ], score: 100, n_mismatch: 0, @@ -330,20 +263,21 @@ mod tests { read_seq: vec![], }; - let (left, right) = transcript.count_soft_clips(); + let [left, right] = transcript.count_soft_clips(); assert_eq!(left, 10); assert_eq!(right, 15); } #[test] fn test_count_soft_clips_left_only() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 100, genome_end: 150, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::SoftClip(20), CigarOp::Match(50)], + cigar: vec![Op::new(Kind::SoftClip, 20), Op::new(Kind::Match, 50)], score: 100, n_mismatch: 0, n_gap: 0, @@ -353,20 +287,21 @@ mod tests { read_seq: vec![], }; - let (left, right) = transcript.count_soft_clips(); + let [left, right] = transcript.count_soft_clips(); assert_eq!(left, 20); assert_eq!(right, 0); } #[test] fn test_count_soft_clips_none() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 100, genome_end: 150, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 100, n_mismatch: 0, n_gap: 0, @@ -376,7 +311,7 @@ mod tests { read_seq: vec![], }; - let (left, right) = transcript.count_soft_clips(); + let [left, right] = transcript.count_soft_clips(); assert_eq!(left, 0); assert_eq!(right, 0); } diff --git a/src/chimeric/detect.rs b/src/chimeric/detect.rs index 7127ba5..034750e 100644 --- a/src/chimeric/detect.rs +++ b/src/chimeric/detect.rs @@ -43,7 +43,7 @@ impl<'a> ChimericDetector<'a> { } let read_len = read_seq.len(); - let (left_clip, right_clip) = transcript.count_soft_clips(); + let [left_clip, right_clip] = transcript.count_soft_clips(); let score_min = params.chim_score_min; let score_drop_max = params.chim_score_drop_max; let non_gtag_penalty = params.chim_score_junction_non_gtag; @@ -51,7 +51,7 @@ impl<'a> ChimericDetector<'a> { let overhang_min = params.chim_junction_overhang_min as usize; // Try right clip first, then left (match STAR's ordering) - let candidates = [(right_clip as usize, true), (left_clip as usize, false)]; + let candidates = [(right_clip, true), (left_clip, false)]; for (clip_len, is_right) in candidates { if clip_len < min_seg { @@ -980,7 +980,7 @@ pub(crate) fn transcript_to_segment(transcript: &Transcript) -> Result Transcript { + use cigar::op::{Kind, Op}; let read_len = (genome_end - genome_start) as usize; Transcript { chr_idx, @@ -1048,7 +1050,7 @@ mod tests { read_end: read_len, i_frag: 0, }], - cigar: vec![CigarOp::Match(read_len as u32)], + cigar: vec![Op::new(Kind::Match, read_len)], score: read_len as i32, n_mismatch: 0, n_gap: 0, @@ -1290,14 +1292,15 @@ mod tests { left_clip: usize, right_clip: usize, ) -> Transcript { + use cigar::op::{Kind, Op}; let aligned_len = read_len - left_clip - right_clip; let mut cigar = vec![]; if left_clip > 0 { - cigar.push(CigarOp::SoftClip(left_clip as u32)); + cigar.push(Op::new(Kind::SoftClip, left_clip)); } - cigar.push(CigarOp::Match(aligned_len as u32)); + cigar.push(Op::new(Kind::Match, aligned_len)); if right_clip > 0 { - cigar.push(CigarOp::SoftClip(right_clip as u32)); + cigar.push(Op::new(Kind::SoftClip, right_clip)); } Transcript { chr_idx, diff --git a/src/chimeric/output.rs b/src/chimeric/output.rs index 422f0d2..f7afce5 100644 --- a/src/chimeric/output.rs +++ b/src/chimeric/output.rs @@ -180,7 +180,6 @@ fn build_segment_record( sa_tag: &str, ) -> Result { use crate::io::fastq::{complement_base, decode_base}; - use crate::io::sam::convert_cigar; use noodles::sam::alignment::record::data::field::Tag; let mut record = RecordBuf::default(); @@ -206,7 +205,7 @@ fn build_segment_record( *record.mapping_quality_mut() = MappingQuality::new(mapq); - *record.cigar_mut() = convert_cigar(&seg.cigar)?; + *record.cigar_mut() = seg.cigar.iter().copied().collect(); // Primary record carries the full read sequence; supplementary uses * (empty). if !is_supplementary { @@ -236,8 +235,8 @@ fn build_segment_record( #[cfg(test)] mod tests { use super::*; - use crate::align::transcript::CigarOp; use crate::chimeric::segment::{ChimericAlignment, ChimericSegment}; + use noodles::sam::alignment::record::cigar; use std::io::Read; use tempfile::tempdir; @@ -256,6 +255,7 @@ mod tests { #[test] fn test_write_inter_chromosomal() { + use cigar::op::{Kind, Op}; let dir = tempdir().unwrap(); let prefix = dir.path().to_str().unwrap(); @@ -269,7 +269,7 @@ mod tests { is_reverse: false, read_start: 0, read_end: 63, - cigar: vec![CigarOp::Match(63)], + cigar: vec![Op::new(Kind::Match, 63)], score: 100, n_mismatch: 2, }; @@ -281,7 +281,7 @@ mod tests { is_reverse: false, read_start: 63, read_end: 100, - cigar: vec![CigarOp::Match(37)], + cigar: vec![Op::new(Kind::Match, 37)], score: 80, n_mismatch: 1, }; @@ -335,6 +335,7 @@ mod tests { #[test] fn test_write_strand_break() { + use cigar::op::{Kind, Op}; let dir = tempdir().unwrap(); let prefix = dir.path().to_str().unwrap(); @@ -348,7 +349,7 @@ mod tests { is_reverse: false, read_start: 0, read_end: 50, - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 100, n_mismatch: 1, }; @@ -360,7 +361,7 @@ mod tests { is_reverse: true, read_start: 50, read_end: 100, - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 100, n_mismatch: 1, }; @@ -419,6 +420,7 @@ mod tests { #[test] fn test_within_bam_returns_two_records() { + use cigar::op::{Kind, Op}; let donor = ChimericSegment { chr_idx: 0, genome_start: 100, @@ -426,7 +428,7 @@ mod tests { is_reverse: false, read_start: 0, read_end: 63, - cigar: vec![CigarOp::Match(63)], + cigar: vec![Op::new(Kind::Match, 63)], score: 63, n_mismatch: 0, }; @@ -437,7 +439,7 @@ mod tests { is_reverse: false, read_start: 63, read_end: 100, - cigar: vec![CigarOp::Match(37)], + cigar: vec![Op::new(Kind::Match, 37)], score: 37, n_mismatch: 1, }; @@ -458,6 +460,7 @@ mod tests { #[test] fn test_within_bam_donor_not_supplementary() { + use cigar::op::{Kind, Op}; let donor = ChimericSegment { chr_idx: 0, genome_start: 100, @@ -465,7 +468,7 @@ mod tests { is_reverse: false, read_start: 0, read_end: 63, - cigar: vec![CigarOp::Match(63)], + cigar: vec![Op::new(Kind::Match, 63)], score: 63, n_mismatch: 0, }; @@ -476,7 +479,7 @@ mod tests { is_reverse: false, read_start: 63, read_end: 100, - cigar: vec![CigarOp::Match(37)], + cigar: vec![Op::new(Kind::Match, 37)], score: 37, n_mismatch: 1, }; @@ -507,6 +510,7 @@ mod tests { #[test] fn test_within_bam_sa_tag_format() { + use cigar::op::{Kind, Op}; use noodles::sam::alignment::record::data::field::Tag; let donor = ChimericSegment { chr_idx: 0, @@ -515,7 +519,7 @@ mod tests { is_reverse: false, read_start: 0, read_end: 63, - cigar: vec![CigarOp::Match(63)], + cigar: vec![Op::new(Kind::Match, 63)], score: 63, n_mismatch: 2, }; @@ -526,7 +530,7 @@ mod tests { is_reverse: true, read_start: 63, read_end: 100, - cigar: vec![CigarOp::Match(37)], + cigar: vec![Op::new(Kind::Match, 37)], score: 37, n_mismatch: 1, }; @@ -571,6 +575,7 @@ mod tests { #[test] fn test_within_bam_donor_has_sequence() { + use cigar::op::{Kind, Op}; let donor = ChimericSegment { chr_idx: 0, genome_start: 100, @@ -578,7 +583,7 @@ mod tests { is_reverse: false, read_start: 0, read_end: 63, - cigar: vec![CigarOp::Match(63)], + cigar: vec![Op::new(Kind::Match, 63)], score: 63, n_mismatch: 0, }; @@ -589,7 +594,7 @@ mod tests { is_reverse: false, read_start: 63, read_end: 100, - cigar: vec![CigarOp::Match(37)], + cigar: vec![Op::new(Kind::Match, 37)], score: 37, n_mismatch: 0, }; diff --git a/src/chimeric/segment.rs b/src/chimeric/segment.rs index 9a5d0a9..78c5ecc 100644 --- a/src/chimeric/segment.rs +++ b/src/chimeric/segment.rs @@ -1,6 +1,7 @@ -// ChimericSegment and ChimericAlignment data structures +//! ChimericSegment and ChimericAlignment data structures -use crate::align::transcript::{CigarOp, cigar_to_string}; +use crate::align::transcript::cigar_to_string; +use noodles::sam::alignment::record::cigar; /// A single segment of a chimeric alignment #[derive(Debug, Clone)] @@ -11,7 +12,7 @@ pub struct ChimericSegment { pub is_reverse: bool, pub read_start: usize, pub read_end: usize, - pub cigar: Vec, + pub cigar: Vec, pub score: i32, pub n_mismatch: u32, } @@ -117,6 +118,7 @@ impl ChimericAlignment { #[cfg(test)] mod tests { use super::*; + use cigar::op::{Kind, Op}; fn mock_segment( chr_idx: usize, @@ -132,7 +134,7 @@ mod tests { is_reverse: false, read_start, read_end, - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 100, n_mismatch: 2, } diff --git a/src/io/bam.rs b/src/io/bam.rs index 2ac0517..9a674ec 100644 --- a/src/io/bam.rs +++ b/src/io/bam.rs @@ -461,8 +461,9 @@ impl SortedBamStdoutWriter { #[cfg(test)] mod tests { use super::*; - use crate::align::transcript::{CigarOp, Exon, Transcript}; + use crate::align::transcript::{Exon, Transcript}; use clap::Parser; + use noodles::sam::alignment::record::cigar; use tempfile::NamedTempFile; fn create_test_genome() -> Genome { @@ -518,6 +519,8 @@ mod tests { #[test] fn test_bam_alignment_write() { + use cigar::op::{Kind, Op}; + let genome = create_test_genome(); let params = create_test_params(); let temp_file = NamedTempFile::new().unwrap(); @@ -537,7 +540,7 @@ mod tests { read_end: 4, i_frag: 0, }], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 0, n_mismatch: 0, n_gap: 0, diff --git a/src/io/sam.rs b/src/io/sam.rs index 66a23a8..19ec856 100644 --- a/src/io/sam.rs +++ b/src/io/sam.rs @@ -1,6 +1,6 @@ -/// SAM/BAM output writer with noodles +//! SAM/BAM output writer with noodles use crate::align::read_align::PairedAlignment; -use crate::align::transcript::{CigarOp, Transcript}; +use crate::align::transcript::Transcript; use crate::error::Error; use crate::genome::Genome; use crate::io::fastq::{complement_base, decode_base}; @@ -11,6 +11,7 @@ use bstr::BString; use noodles::sam; use noodles::sam::alignment::io::Write; use noodles::sam::alignment::record::MappingQuality; +use noodles::sam::alignment::record::cigar; use noodles::sam::alignment::record::data::field::Tag; use noodles::sam::alignment::record_buf::data::field::Value; use noodles::sam::alignment::record_buf::data::field::value::Array; @@ -424,7 +425,7 @@ impl SamWriter { Error::Alignment(format!("invalid position {}: {}", mapped_pos, e)) })?); *mapped_rec.mapping_quality_mut() = MappingQuality::new(mapq); - *mapped_rec.cigar_mut() = convert_cigar(&mapped_transcript.cigar)?; + *mapped_rec.cigar_mut() = mapped_transcript.cigar.iter().copied().collect(); // RNEXT = own chr, PNEXT = own pos (STAR convention for unmapped mate) *mapped_rec.mate_reference_sequence_id_mut() = Some(mapped_transcript.chr_idx); @@ -614,7 +615,7 @@ impl SamWriter { *record.mapping_quality_mut() = MappingQuality::new(mapq); // CIGAR (already has N ops stripped by align_to_transcripts) - *record.cigar_mut() = convert_cigar(&t.cigar)?; + *record.cigar_mut() = t.cigar.iter().copied().collect(); // SEQ / QUAL — STAR writes the original-orientation sequence when // FLAG 0x10 is unset (forward alignment in t-space) and RC'd seq @@ -910,8 +911,7 @@ fn transcript_to_record( *record.mapping_quality_mut() = MappingQuality::new(mapq); // CIGAR - let cigar = convert_cigar(&transcript.cigar)?; - *record.cigar_mut() = cigar; + *record.cigar_mut() = transcript.cigar.iter().copied().collect(); // Sequence and quality scores // Per SAM spec: when FLAG & 16 (reverse strand), SEQ is the reverse complement @@ -980,15 +980,16 @@ fn transcript_to_record( /// Not emitted in SAM output (STAR maps NM attribute to 'nM' tag), but kept for tests. #[cfg(test)] fn compute_edit_distance(transcript: &Transcript) -> i32 { - let indel_bases: u32 = transcript + use cigar::op::Kind; + let indel_bases: usize = transcript .cigar .iter() - .filter_map(|op| match op { - CigarOp::Ins(n) | CigarOp::Del(n) => Some(*n), + .filter_map(|op| match op.kind() { + Kind::Insertion | Kind::Deletion => Some(op.len()), _ => None, }) .sum(); - (transcript.n_mismatch + indel_bases) as i32 + (transcript.n_mismatch + indel_bases as u32) as i32 } /// Derive XS strand tag from transcript junction motifs. @@ -1038,22 +1039,23 @@ fn build_jm_tag(transcript: &Transcript) -> Option { /// Format: [start1, end1, start2, end2, ...] where start is first intronic base /// and end is last intronic base (both 1-based, inclusive). fn build_ji_tag(transcript: &Transcript, chr_start: u64) -> Option { + use cigar::op::Kind; if transcript.n_junction == 0 { return None; } let mut coords: Vec = Vec::new(); let mut genome_pos = transcript.genome_start; for op in &transcript.cigar { - match op { - CigarOp::RefSkip(n) => { + match op.kind() { + Kind::Skip => { let intron_start = (genome_pos - chr_start + 1) as i32; // 1-based - let intron_end = (genome_pos + *n as u64 - chr_start) as i32; // 1-based inclusive + let intron_end = (genome_pos + op.len() as u64 - chr_start) as i32; // 1-based inclusive coords.push(intron_start); coords.push(intron_end); - genome_pos += *n as u64; + genome_pos += op.len() as u64; } - CigarOp::Match(n) | CigarOp::Equal(n) | CigarOp::Diff(n) | CigarOp::Del(n) => { - genome_pos += *n as u64; + Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch | Kind::Deletion => { + genome_pos += op.len() as u64; } _ => {} // Ins, SoftClip, HardClip don't consume reference } @@ -1071,6 +1073,8 @@ fn build_md_tag( genome: &Genome, is_reverse: bool, ) -> String { + use cigar::op::Kind; + // Build the SAM-order sequence (RC for reverse strand) let sam_seq: Vec = if is_reverse { read_seq.iter().rev().map(|&b| complement_base(b)).collect() @@ -1084,9 +1088,9 @@ fn build_md_tag( let mut read_pos: usize = 0; for op in &transcript.cigar { - match op { - CigarOp::Match(n) | CigarOp::Equal(n) | CigarOp::Diff(n) => { - for _ in 0..*n { + match op.kind() { + Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => { + for _ in 0..op.len() { let ref_base = genome.get_base(genome_pos).unwrap_or(4); let read_base = if read_pos < sam_seq.len() { sam_seq[read_pos] @@ -1104,26 +1108,23 @@ fn build_md_tag( read_pos += 1; } } - CigarOp::Del(n) => { + Kind::Deletion => { write!(md, "{}", match_count).unwrap(); match_count = 0; md.push('^'); - for _ in 0..*n { + for _ in 0..op.len() { let ref_base = genome.get_base(genome_pos).unwrap_or(4); md.push(decode_base(ref_base) as char); genome_pos += 1; } } - CigarOp::Ins(n) => { - read_pos += *n as usize; - } - CigarOp::RefSkip(n) => { - genome_pos += *n as u64; + Kind::Insertion | Kind::SoftClip => { + read_pos += op.len(); } - CigarOp::SoftClip(n) => { - read_pos += *n as usize; + Kind::Skip => { + genome_pos += op.len() as u64; } - CigarOp::HardClip(_) => {} + Kind::HardClip | Kind::Pad => {} } } // Emit trailing match count @@ -1131,32 +1132,6 @@ fn build_md_tag( md } -/// Convert rustar-aligner CigarOp to noodles Cigar -pub(crate) fn convert_cigar(ops: &[CigarOp]) -> Result { - use sam::alignment::record::cigar::op::Kind; - - let mut cigar = sam::alignment::record_buf::Cigar::default(); - - for op in ops { - let kind = match op { - CigarOp::Match(_) => Kind::Match, - CigarOp::Equal(_) => Kind::SequenceMatch, - CigarOp::Diff(_) => Kind::SequenceMismatch, - CigarOp::Ins(_) => Kind::Insertion, - CigarOp::Del(_) => Kind::Deletion, - CigarOp::RefSkip(_) => Kind::Skip, - CigarOp::SoftClip(_) => Kind::SoftClip, - CigarOp::HardClip(_) => Kind::HardClip, - }; - - let len = op.len() as usize; - let noodles_op = sam::alignment::record::cigar::Op::new(kind, len); - cigar.as_mut().push(noodles_op); - } - - Ok(cigar) -} - /// Build a SAM record for one mate of a paired-end read #[allow(clippy::too_many_arguments)] fn build_paired_mate_record( @@ -1231,8 +1206,7 @@ fn build_paired_mate_record( *record.mapping_quality_mut() = MappingQuality::new(mapq); // CIGAR - let cigar = convert_cigar(&transcript.cigar)?; - *record.cigar_mut() = cigar; + *record.cigar_mut() = transcript.cigar.iter().copied().collect(); // RNEXT (mate reference sequence from the mate's actual alignment) *record.mate_reference_sequence_id_mut() = Some(mate_transcript.chr_idx); @@ -1315,6 +1289,7 @@ mod tests { use crate::align::score::SpliceMotif; use crate::genome::Genome; use clap::Parser; + use noodles::sam::alignment::record::cigar; use tempfile::NamedTempFile; /// Build an attribute set with all tags enabled (for tests that don't care about filtering) @@ -1344,20 +1319,6 @@ mod tests { } } - #[test] - fn test_convert_cigar() { - let ops = vec![ - CigarOp::Match(50), - CigarOp::Ins(3), - CigarOp::Del(2), - CigarOp::RefSkip(100), - CigarOp::Match(50), - ]; - - let cigar = convert_cigar(&ops).unwrap(); - assert_eq!(cigar.as_ref().len(), 5); - } - #[test] fn test_build_sam_header() { let genome = make_test_genome(); @@ -1401,6 +1362,7 @@ mod tests { #[test] fn test_sam_output_includes_rg_header_and_tag() { use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; use std::io::Read; let genome = make_test_genome(); @@ -1428,7 +1390,7 @@ mod tests { read_end: 4, i_frag: 0, }], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1478,6 +1440,7 @@ mod tests { #[test] fn test_transcript_to_record() { + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let transcript = Transcript { @@ -1486,7 +1449,7 @@ mod tests { genome_end: 60, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 100, n_mismatch: 2, n_gap: 0, @@ -1569,6 +1532,7 @@ mod tests { #[test] fn test_build_paired_mate_record_flags() { use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let genome = make_test_genome(); @@ -1584,7 +1548,7 @@ mod tests { read_end: 4, i_frag: 0, }], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1606,7 +1570,7 @@ mod tests { read_end: 3, i_frag: 0, }], - cigar: vec![CigarOp::Match(3)], + cigar: vec![Op::new(Kind::Match, 3)], score: 90, n_mismatch: 1, n_gap: 0, @@ -1679,6 +1643,7 @@ mod tests { #[test] fn test_build_paired_mate_record_mate_fields() { use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let genome = make_test_genome(); @@ -1695,7 +1660,7 @@ mod tests { read_end: 4, i_frag: 0, }], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 200, n_mismatch: 0, n_gap: 0, @@ -1718,7 +1683,7 @@ mod tests { read_end: 3, i_frag: 0, }], - cigar: vec![CigarOp::Match(3)], + cigar: vec![Op::new(Kind::Match, 3)], score: 150, n_mismatch: 1, n_gap: 0, @@ -1774,6 +1739,7 @@ mod tests { #[test] fn test_tags_nh_hi_as_nm() { + use cigar::op::{Kind, Op}; let genome = make_test_genome(); // Transcript with 2 mismatches and a 3bp deletion → NM = 2 + 3 = 5 @@ -1783,7 +1749,11 @@ mod tests { genome_end: 60, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(20), CigarOp::Del(3), CigarOp::Match(30)], + cigar: vec![ + Op::new(Kind::Match, 20), + Op::new(Kind::Deletion, 3), + Op::new(Kind::Match, 30), + ], score: 100, n_mismatch: 2, n_gap: 1, @@ -1843,6 +1813,7 @@ mod tests { #[test] fn test_edit_distance_computation() { + use cigar::op::{Kind, Op}; // Pure match: NM = n_mismatch only let t1 = Transcript { chr_idx: 0, @@ -1850,7 +1821,7 @@ mod tests { genome_end: 50, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 100, n_mismatch: 3, n_gap: 0, @@ -1869,11 +1840,11 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(20), - CigarOp::Ins(5), - CigarOp::Match(10), - CigarOp::Del(7), - CigarOp::Match(20), + Op::new(Kind::Match, 20), + Op::new(Kind::Insertion, 5), + Op::new(Kind::Match, 10), + Op::new(Kind::Deletion, 7), + Op::new(Kind::Match, 20), ], score: 80, n_mismatch: 1, @@ -1893,9 +1864,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(25), - CigarOp::RefSkip(1000), - CigarOp::Match(25), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 1000), + Op::new(Kind::Match, 25), ], score: 50, n_mismatch: 0, @@ -1915,9 +1886,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::SoftClip(10), - CigarOp::Match(40), - CigarOp::SoftClip(10), + Op::new(Kind::SoftClip, 10), + Op::new(Kind::Match, 40), + Op::new(Kind::SoftClip, 10), ], score: 40, n_mismatch: 2, @@ -1932,6 +1903,7 @@ mod tests { #[test] fn test_transcript_to_record_has_tags() { + use cigar::op::{Kind, Op}; // Verify the existing test_transcript_to_record scenario also has tags let genome = make_test_genome(); @@ -1941,7 +1913,7 @@ mod tests { genome_end: 60, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 100, n_mismatch: 2, n_gap: 0, @@ -1985,6 +1957,7 @@ mod tests { #[test] fn test_secondary_flag() { + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let params = Parameters::parse_from(vec!["rustar-aligner", "--readFilesIn", "test.fq"]); @@ -1995,7 +1968,7 @@ mod tests { genome_end: 50, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 100, n_mismatch: 0, n_gap: 0, @@ -2010,7 +1983,7 @@ mod tests { genome_end: 52, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 98, n_mismatch: 1, n_gap: 0, @@ -2025,7 +1998,7 @@ mod tests { genome_end: 54, is_reverse: true, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 96, n_mismatch: 2, n_gap: 0, @@ -2063,6 +2036,7 @@ mod tests { #[test] fn test_xs_tag_spliced() { + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let transcript = Transcript { @@ -2072,9 +2046,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(25), - CigarOp::RefSkip(100), - CigarOp::Match(25), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 25), ], score: 50, n_mismatch: 0, @@ -2111,6 +2085,7 @@ mod tests { #[test] fn test_xs_tag_unspliced() { + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let transcript = Transcript { @@ -2119,7 +2094,7 @@ mod tests { genome_end: 50, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 100, n_mismatch: 0, n_gap: 0, @@ -2155,6 +2130,7 @@ mod tests { #[test] fn test_xs_tag_reverse_strand() { + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let transcript = Transcript { @@ -2164,9 +2140,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(25), - CigarOp::RefSkip(100), - CigarOp::Match(25), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 25), ], score: 50, n_mismatch: 0, @@ -2203,6 +2179,7 @@ mod tests { #[test] fn test_xs_tag_conflicting_motifs() { + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let transcript = Transcript { @@ -2212,11 +2189,11 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(25), - CigarOp::RefSkip(100), - CigarOp::Match(25), - CigarOp::RefSkip(100), - CigarOp::Match(25), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 25), ], score: 50, n_mismatch: 0, @@ -2253,6 +2230,7 @@ mod tests { #[test] fn test_xs_not_emitted_when_disabled() { + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let transcript = Transcript { @@ -2262,9 +2240,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(25), - CigarOp::RefSkip(100), - CigarOp::Match(25), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 25), ], score: 50, n_mismatch: 0, @@ -2301,6 +2279,7 @@ mod tests { #[test] fn test_out_sam_mult_nmax() { + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let params = Parameters::parse_from(vec![ "rustar-aligner", @@ -2317,7 +2296,7 @@ mod tests { genome_end: (i + 50) as u64, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 100 - i, n_mismatch: 0, n_gap: 0, @@ -2377,6 +2356,7 @@ mod tests { #[test] fn test_build_jm_tag_basic() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 0, @@ -2384,9 +2364,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(25), - CigarOp::RefSkip(100), - CigarOp::Match(25), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 25), ], score: 50, n_mismatch: 0, @@ -2405,6 +2385,7 @@ mod tests { #[test] fn test_build_jm_tag_annotated() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 0, @@ -2412,9 +2393,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(25), - CigarOp::RefSkip(100), - CigarOp::Match(25), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 25), ], score: 50, n_mismatch: 0, @@ -2433,13 +2414,14 @@ mod tests { #[test] fn test_build_jm_tag_empty() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 0, genome_end: 50, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 50, n_mismatch: 0, n_gap: 0, @@ -2454,6 +2436,7 @@ mod tests { #[test] fn test_build_jm_tag_multiple_junctions() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 0, @@ -2461,11 +2444,11 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(25), - CigarOp::RefSkip(100), - CigarOp::Match(25), - CigarOp::RefSkip(100), - CigarOp::Match(25), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 25), ], score: 50, n_mismatch: 0, @@ -2484,6 +2467,7 @@ mod tests { #[test] fn test_build_ji_tag_basic() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 100, @@ -2491,9 +2475,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(25), - CigarOp::RefSkip(200), - CigarOp::Match(25), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 200), + Op::new(Kind::Match, 25), ], score: 50, n_mismatch: 0, @@ -2513,13 +2497,14 @@ mod tests { #[test] fn test_build_ji_tag_empty() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 0, genome_end: 50, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 50, n_mismatch: 0, n_gap: 0, @@ -2534,6 +2519,7 @@ mod tests { #[test] fn test_build_md_tag_perfect_match() { + use cigar::op::{Kind, Op}; // Genome: ACGTACGT (A=0,C=1,G=2,T=3) let genome = make_test_genome(); let transcript = Transcript { @@ -2542,7 +2528,7 @@ mod tests { genome_end: 4, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 4, n_mismatch: 0, n_gap: 0, @@ -2560,6 +2546,7 @@ mod tests { #[test] fn test_build_md_tag_mismatches() { + use cigar::op::{Kind, Op}; // Genome: ACGTACGT let genome = make_test_genome(); let transcript = Transcript { @@ -2568,7 +2555,7 @@ mod tests { genome_end: 4, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 2, n_mismatch: 2, n_gap: 0, @@ -2587,6 +2574,7 @@ mod tests { #[test] fn test_build_md_tag_deletion() { + use cigar::op::{Kind, Op}; // Genome: ACGTACGT let genome = make_test_genome(); let transcript = Transcript { @@ -2595,7 +2583,11 @@ mod tests { genome_end: 6, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(2), CigarOp::Del(2), CigarOp::Match(2)], + cigar: vec![ + Op::new(Kind::Match, 2), + Op::new(Kind::Deletion, 2), + Op::new(Kind::Match, 2), + ], score: 4, n_mismatch: 0, n_gap: 1, @@ -2613,6 +2605,7 @@ mod tests { #[test] fn test_build_md_tag_insertion() { + use cigar::op::{Kind, Op}; // Genome: ACGTACGT let genome = make_test_genome(); let transcript = Transcript { @@ -2621,7 +2614,11 @@ mod tests { genome_end: 4, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(2), CigarOp::Ins(2), CigarOp::Match(2)], + cigar: vec![ + Op::new(Kind::Match, 2), + Op::new(Kind::Insertion, 2), + Op::new(Kind::Match, 2), + ], score: 4, n_mismatch: 0, n_gap: 1, @@ -2639,6 +2636,7 @@ mod tests { #[test] fn test_build_md_tag_soft_clip() { + use cigar::op::{Kind, Op}; // Genome: ACGTACGT let genome = make_test_genome(); let transcript = Transcript { @@ -2648,9 +2646,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::SoftClip(2), - CigarOp::Match(4), - CigarOp::SoftClip(2), + Op::new(Kind::SoftClip, 2), + Op::new(Kind::Match, 4), + Op::new(Kind::SoftClip, 2), ], score: 4, n_mismatch: 0, @@ -2669,6 +2667,7 @@ mod tests { #[test] fn test_tags_jm_ji_md_in_record() { + use cigar::op::{Kind, Op}; // Verify tags appear in a full transcript_to_record call let genome = make_test_genome(); @@ -2678,7 +2677,7 @@ mod tests { genome_end: 4, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 4, n_mismatch: 0, n_gap: 0, @@ -2718,6 +2717,7 @@ mod tests { #[test] fn test_build_paired_mate_record_cross_strand() { use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let genome = make_test_genome(); @@ -2734,7 +2734,7 @@ mod tests { read_end: 4, i_frag: 0, }], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 0, n_gap: 0, @@ -2757,7 +2757,7 @@ mod tests { read_end: 3, i_frag: 0, }], - cigar: vec![CigarOp::Match(3)], + cigar: vec![Op::new(Kind::Match, 3)], score: 90, n_mismatch: 0, n_gap: 0, @@ -2820,6 +2820,7 @@ mod tests { #[test] fn test_build_paired_mate_record_per_mate_tags() { use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let genome = make_test_genome(); @@ -2836,7 +2837,7 @@ mod tests { read_end: 4, i_frag: 0, }], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 0, n_gap: 0, @@ -2859,7 +2860,11 @@ mod tests { read_end: 3, i_frag: 0, }], - cigar: vec![CigarOp::Match(2), CigarOp::Del(1), CigarOp::Match(1)], + cigar: vec![ + Op::new(Kind::Match, 2), + Op::new(Kind::Deletion, 1), + Op::new(Kind::Match, 1), + ], score: 80, n_mismatch: 2, n_gap: 1, @@ -2938,6 +2943,7 @@ mod tests { #[test] fn test_build_paired_mate_record_both_forward() { use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let genome = make_test_genome(); @@ -2954,7 +2960,7 @@ mod tests { read_end: 4, i_frag: 0, }], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 0, n_gap: 0, @@ -2976,7 +2982,7 @@ mod tests { read_end: 3, i_frag: 0, }], - cigar: vec![CigarOp::Match(3)], + cigar: vec![Op::new(Kind::Match, 3)], score: 90, n_mismatch: 0, n_gap: 0, @@ -3033,6 +3039,7 @@ mod tests { #[test] fn test_out_sam_attributes_standard() { + use cigar::op::{Kind, Op}; // Default (Standard) → NH, HI, AS, NM present; XS, jM, jI, MD absent let genome = make_test_genome(); @@ -3042,7 +3049,7 @@ mod tests { genome_end: 4, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 1, n_gap: 0, @@ -3108,6 +3115,7 @@ mod tests { #[test] fn test_out_sam_attributes_none() { + use cigar::op::{Kind, Op}; // None → no optional tags at all let genome = make_test_genome(); @@ -3117,7 +3125,7 @@ mod tests { genome_end: 4, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 1, n_gap: 0, @@ -3182,6 +3190,7 @@ mod tests { #[test] fn test_out_sam_attributes_explicit() { + use cigar::op::{Kind, Op}; // Explicit ["NH", "MD"] → only NH and MD present let genome = make_test_genome(); @@ -3191,7 +3200,7 @@ mod tests { genome_end: 4, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 0, n_gap: 0, @@ -3317,6 +3326,7 @@ mod tests { #[test] fn test_nm_vs_nm_mismatch_difference() { + use cigar::op::{Kind, Op}; // Verify NM (edit distance) ≠ nM (mismatches only) when indels present let genome = make_test_genome(); @@ -3327,11 +3337,11 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(20), - CigarOp::Ins(5), - CigarOp::Match(10), - CigarOp::Del(3), - CigarOp::Match(20), + Op::new(Kind::Match, 20), + Op::new(Kind::Insertion, 5), + Op::new(Kind::Match, 10), + Op::new(Kind::Deletion, 3), + Op::new(Kind::Match, 20), ], score: 80, n_mismatch: 2, @@ -3376,6 +3386,7 @@ mod tests { #[test] fn test_build_half_mapped_flags() { use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let params = @@ -3393,7 +3404,7 @@ mod tests { read_end: 4, i_frag: 0, }], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 0, n_gap: 0, @@ -3445,6 +3456,7 @@ mod tests { #[test] fn test_build_half_mapped_rnext_pnext() { use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let params = @@ -3462,7 +3474,7 @@ mod tests { read_end: 4, i_frag: 0, }], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 0, n_gap: 0, @@ -3527,6 +3539,7 @@ mod tests { #[test] fn test_build_half_mapped_mate_order() { use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let params = @@ -3544,7 +3557,7 @@ mod tests { read_end: 4, i_frag: 0, }], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 0, n_gap: 0, diff --git a/src/lib.rs b/src/lib.rs index 26e788e..cda64a0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,7 +8,6 @@ clippy::cast_sign_loss, clippy::doc_markdown, clippy::items_after_statements, - clippy::match_same_arms, clippy::missing_errors_doc, clippy::missing_panics_doc, clippy::must_use_candidate, @@ -41,6 +40,7 @@ pub mod quant; pub mod stats; use log::info; +use noodles::sam::alignment::record::cigar; use crate::params::{Parameters, RunMode}; @@ -809,20 +809,21 @@ fn extract_junction_keys( index: &crate::index::GenomeIndex, ) -> Vec { use crate::align::score::AlignmentScorer; - use crate::align::transcript::CigarOp; + use cigar::op::Kind; let scorer = AlignmentScorer::from_params_minimal(); let mut keys = Vec::new(); let mut genome_pos = transcript.genome_start; for op in &transcript.cigar { - match op { - CigarOp::RefSkip(len) => { - let intron_len = *len; + match op.kind() { + Kind::Skip => { + let intron_len = op.len(); let intron_start = genome_pos + 1; let intron_end = genome_pos + intron_len as u64; - let motif = scorer.detect_splice_motif(genome_pos, intron_len, &index.genome); + let motif = + scorer.detect_splice_motif(genome_pos, intron_len as u32, &index.genome); let strand = match motif.implied_strand() { Some('+') => 1u8, Some('-') => 2u8, @@ -840,13 +841,10 @@ fn extract_junction_keys( genome_pos += intron_len as u64; } - CigarOp::Match(len) | CigarOp::Equal(len) | CigarOp::Diff(len) => { - genome_pos += *len as u64; + Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch | Kind::Deletion => { + genome_pos += op.len() as u64; } - CigarOp::Del(len) => { - genome_pos += *len as u64; - } - CigarOp::Ins(_) | CigarOp::SoftClip(_) | CigarOp::HardClip(_) => {} + Kind::Insertion | Kind::SoftClip | Kind::HardClip | Kind::Pad => {} } } @@ -1846,7 +1844,7 @@ fn record_transcript_junctions( is_unique: bool, ) { use crate::align::score::AlignmentScorer; - use crate::align::transcript::CigarOp; + use cigar::op::Kind; // First pass: compute exon segment lengths (query-consuming bases between N operations) // An "exon segment" is the query bases on each side of a splice junction. @@ -1854,20 +1852,17 @@ fn record_transcript_junctions( let mut current_exon_len = 0u32; for op in &transcript.cigar { - match op { - CigarOp::Match(len) | CigarOp::Equal(len) | CigarOp::Diff(len) => { - current_exon_len += *len; - } - CigarOp::Ins(len) => { - current_exon_len += *len; + match op.kind() { + Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch | Kind::Insertion => { + current_exon_len += op.len() as u32; } - CigarOp::RefSkip(_) => { + Kind::Skip => { exon_lengths.push(current_exon_len); current_exon_len = 0; } // Soft clips, deletions, hard clips do not contribute to overhang // STAR counts only matched/inserted bases (not soft-clipped bases) - CigarOp::SoftClip(_) | CigarOp::Del(_) | CigarOp::HardClip(_) => {} + Kind::Deletion | Kind::SoftClip | Kind::HardClip | Kind::Pad => {} } } exon_lengths.push(current_exon_len); // Final exon segment @@ -1879,15 +1874,16 @@ fn record_transcript_junctions( let scorer = AlignmentScorer::from_params_minimal(); for op in &transcript.cigar { - match op { - CigarOp::RefSkip(len) => { + match op.kind() { + Kind::Skip => { // This is a splice junction - let intron_len = *len; + let intron_len = op.len(); let intron_start = genome_pos + 1; // 1-based, first intronic base let intron_end = genome_pos + intron_len as u64; // 1-based, last intronic base // Detect splice motif - let motif = scorer.detect_splice_motif(genome_pos, intron_len, &index.genome); + let motif = + scorer.detect_splice_motif(genome_pos, intron_len as u32, &index.genome); // Compute overhang: min(left_exon_length, right_exon_length) let left_exon = exon_lengths[junction_idx]; @@ -1923,14 +1919,10 @@ fn record_transcript_junctions( genome_pos += intron_len as u64; junction_idx += 1; } - CigarOp::Match(len) | CigarOp::Equal(len) | CigarOp::Diff(len) => { - genome_pos += *len as u64; - } - CigarOp::Ins(_) => {} - CigarOp::Del(len) => { - genome_pos += *len as u64; + Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch | Kind::Deletion => { + genome_pos += op.len() as u64; } - CigarOp::SoftClip(_) | CigarOp::HardClip(_) => {} + Kind::Insertion | Kind::SoftClip | Kind::HardClip | Kind::Pad => {} } } } diff --git a/src/mapq.rs b/src/mapq.rs index c1c8c02..d93e2e4 100644 --- a/src/mapq.rs +++ b/src/mapq.rs @@ -15,11 +15,10 @@ /// MAPQ score (0-255) pub fn calculate_mapq(n_alignments: usize, mapq_unique: u8) -> u8 { match n_alignments { - 0 => 0, 1 => mapq_unique, 2 => 3, 3 | 4 => 1, - _ => 0, // >= 5 + _ => 0, // 0 or >= 5 } } diff --git a/src/quant/transcriptome.rs b/src/quant/transcriptome.rs index bd2d83a..27dfc1f 100644 --- a/src/quant/transcriptome.rs +++ b/src/quant/transcriptome.rs @@ -10,15 +10,15 @@ //! only substantive divergence that rustar-aligner builds the transcript tables on //! the fly from the input GTF instead of loading persisted //! `transcriptInfo.tab`/`exonInfo.tab` files. -use std::collections::HashMap; -use std::io::Write as _; -use std::path::Path; - -use crate::align::transcript::{CigarOp, Exon, Transcript}; +use crate::align::transcript::{Exon, Transcript}; use crate::error::Error; use crate::genome::Genome; use crate::junction::gtf::GtfRecord; use crate::params::Parameters; +use noodles::sam::alignment::record::cigar; +use std::collections::HashMap; +use std::io::Write as _; +use std::path::Path; /// GTF attribute names the transcriptome index reads. const GTF_ATTR_TRANSCRIPT_ID: &str = "transcript_id"; @@ -265,9 +265,8 @@ impl TranscriptomeIndex { let total_len = ex_len_cum; let strand_u8 = match strand_char { - '+' => 1u8, '-' => 2u8, - _ => 1u8, // unknown → treat as forward (STAR default) + _ => 1u8, // forward or unknown → treat as forward (STAR default) }; let gene_id = first.attributes.get(gene_tag).cloned().unwrap_or_default(); @@ -1102,10 +1101,10 @@ fn align_to_one_transcript( // Build projected CIGAR: drop N operations (splices collapse in t-space); // for reverse-strand transcripts, reverse the resulting op sequence. - let mut proj_cigar: Vec = align + let mut proj_cigar: Vec = align .cigar .iter() - .filter(|op| !matches!(op, CigarOp::RefSkip(_))) + .filter(|op| op.kind() != cigar::op::Kind::Skip) .copied() .collect(); if tr_strand == 2 { @@ -1172,10 +1171,11 @@ pub fn filter_and_project( } fn has_soft_clip(align: &Transcript) -> bool { + use cigar::op::Kind; align .cigar .iter() - .any(|op| matches!(op, CigarOp::SoftClip(n) if *n > 0)) + .any(|op| op.kind() == Kind::SoftClip && !op.is_empty()) } /// Extend the 5'/3' soft-clips of `align` back into matched bases, counting @@ -1188,7 +1188,7 @@ fn extend_softclips( params: &Parameters, ) -> Option { // Determine left / right clip sizes from the CIGAR. - let (left_clip, right_clip) = align.count_soft_clips(); + let [left_clip, right_clip] = align.count_soft_clips(); let mut n_mm_extra: u32 = 0; @@ -1198,7 +1198,7 @@ fn extend_softclips( if left_clip > 0 && let Some(first) = align.exons.first() { - for b in 1..=left_clip as usize { + for b in 1..=left_clip { if b > first.read_start { break; } @@ -1224,7 +1224,7 @@ fn extend_softclips( if right_clip > 0 && let Some(last) = align.exons.last() { - for b in 0..right_clip as usize { + for b in 0..right_clip { let r_idx = last.read_end + b; let g_idx = (last.genome_end as usize) + b; if r_idx >= read_bases_align_orientation.len() || g_idx >= genome.sequence.len() { @@ -1263,7 +1263,7 @@ fn extend_softclips( if right_clip > 0 && let Some(last) = ext.exons.last_mut() { - last.read_end += right_clip as usize; + last.read_end += right_clip; last.genome_end += right_clip as u64; } @@ -1281,17 +1281,18 @@ fn extend_softclips( /// adjacent `Match` ops. Interior soft-clips (rare — only appear in chimeric /// contexts) are left alone. fn rebuild_cigar_without_softclips( - cigar: &[CigarOp], - left_clip: u32, - right_clip: u32, -) -> Vec { - let mut out: Vec = Vec::with_capacity(cigar.len()); + cigar: &[cigar::Op], + left_clip: usize, + right_clip: usize, +) -> Vec { + use cigar::op::{Kind, Op}; + let mut out: Vec = Vec::with_capacity(cigar.len()); let mut start_idx = 0; let mut end_idx = cigar.len(); - if left_clip > 0 && matches!(cigar.first(), Some(CigarOp::SoftClip(_))) { + if left_clip > 0 && cigar.first().is_some_and(|op| op.kind() == Kind::SoftClip) { start_idx = 1; } - if right_clip > 0 && end_idx > start_idx && matches!(cigar[end_idx - 1], CigarOp::SoftClip(_)) { + if right_clip > 0 && end_idx > start_idx && cigar[end_idx - 1].kind() == Kind::SoftClip { end_idx -= 1; } @@ -1299,23 +1300,25 @@ fn rebuild_cigar_without_softclips( for (i, op) in body.iter().enumerate() { if i == 0 && left_clip > 0 { // Fold left_clip into the first op if it's match-like. - match op { - CigarOp::Match(n) => out.push(CigarOp::Match(n + left_clip)), - CigarOp::Equal(n) => out.push(CigarOp::Equal(n + left_clip)), + match op.kind() { + Kind::Match => out.push(Op::new(Kind::Match, op.len() + left_clip)), + Kind::SequenceMatch => out.push(Op::new(Kind::SequenceMatch, op.len() + left_clip)), _ => { // Extension landed on a non-match op (shouldn't normally // happen). Emit as Match. - out.push(CigarOp::Match(left_clip)); + out.push(Op::new(Kind::Match, left_clip)); out.push(*op); } } } else if i + 1 == body.len() && right_clip > 0 { - match op { - CigarOp::Match(n) => out.push(CigarOp::Match(n + right_clip)), - CigarOp::Equal(n) => out.push(CigarOp::Equal(n + right_clip)), + match op.kind() { + Kind::Match => out.push(Op::new(Kind::Match, op.len() + right_clip)), + Kind::SequenceMatch => { + out.push(Op::new(Kind::SequenceMatch, op.len() + right_clip)); + } _ => { out.push(*op); - out.push(CigarOp::Match(right_clip)); + out.push(Op::new(Kind::Match, right_clip)); } } } else { @@ -1382,6 +1385,8 @@ fn is_splice_boundary_before(align: &Transcript, iab: usize) -> bool { #[cfg(test)] mod tests { + use noodles::sam::alignment::record::cigar; + use super::*; fn make_genome() -> Genome { @@ -1952,7 +1957,7 @@ mod tests { chr_idx: usize, is_reverse: bool, exons: Vec<(u64, u64, usize, usize)>, - cigar: Vec, + cigar: Vec, ) -> Transcript { let proj_exons: Vec = exons .into_iter() @@ -1985,13 +1990,19 @@ mod tests { #[test] fn project_single_exon_align_into_single_exon_transcript() { + use cigar::op::{Kind, Op}; let genome = make_genome(); // Transcript: chr1 [100, 200) forward. let gtf = vec![make_exon("chr1", 101, 200, '+', "G1", "T1")]; let idx = TranscriptomeIndex::from_gtf_exons(>f, &genome).unwrap(); // Align fully inside exon: genome [110, 150), read [0, 40). - let align = make_align(0, false, vec![(110, 150, 0, 40)], vec![CigarOp::Match(40)]); + let align = make_align( + 0, + false, + vec![(110, 150, 0, 40)], + vec![Op::new(Kind::Match, 40)], + ); let results = align_to_transcripts(&align, &idx, 40); assert_eq!(results.len(), 1); let r = &results[0]; @@ -2004,12 +2015,13 @@ mod tests { assert_eq!(r.exons[0].read_start, 0); assert_eq!(r.exons[0].read_end, 40); // CIGAR must have no N - assert!(r.cigar.iter().all(|op| !matches!(op, CigarOp::RefSkip(_)))); + assert!(r.cigar.iter().all(|op| op.kind() != Kind::Skip)); assert_eq!(r.cigar.len(), 1); } #[test] fn project_two_exon_align_matching_junction() { + use cigar::op::{Kind, Op}; let genome = make_genome(); // Transcript T1: chr1 [100, 200) + [300, 400) forward. tr_length = 200. let gtf = vec![ @@ -2025,9 +2037,9 @@ mod tests { false, vec![(150, 200, 0, 50), (300, 350, 50, 100)], vec![ - CigarOp::Match(50), - CigarOp::RefSkip(100), - CigarOp::Match(50), + Op::new(Kind::Match, 50), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 50), ], ); let results = align_to_transcripts(&align, &idx, 100); @@ -2042,12 +2054,13 @@ mod tests { assert_eq!(r.exons[1].genome_start, 100); assert_eq!(r.exons[1].genome_end, 150); // CIGAR: no N - assert!(r.cigar.iter().all(|op| !matches!(op, CigarOp::RefSkip(_)))); + assert!(r.cigar.iter().all(|op| op.kind() != Kind::Skip)); assert_eq!(r.cigar.len(), 2); } #[test] fn project_mismatched_junction_fails() { + use cigar::op::{Kind, Op}; let genome = make_genome(); let gtf = vec![ make_exon("chr1", 101, 200, '+', "G1", "T1"), @@ -2061,9 +2074,9 @@ mod tests { false, vec![(150, 195, 0, 45), (305, 350, 45, 90)], vec![ - CigarOp::Match(45), - CigarOp::RefSkip(110), - CigarOp::Match(45), + Op::new(Kind::Match, 45), + Op::new(Kind::Skip, 110), + Op::new(Kind::Match, 45), ], ); let results = align_to_transcripts(&align, &idx, 90); @@ -2072,13 +2085,19 @@ mod tests { #[test] fn project_onto_reverse_strand_transcript() { + use cigar::op::{Kind, Op}; let genome = make_genome(); // Reverse transcript: chr1 [100, 200) - strand. tr_length = 100. let gtf = vec![make_exon("chr1", 101, 200, '-', "G1", "T1")]; let idx = TranscriptomeIndex::from_gtf_exons(>f, &genome).unwrap(); // Align forward on genome [120, 160), read [0, 40). - let align = make_align(0, false, vec![(120, 160, 0, 40)], vec![CigarOp::Match(40)]); + let align = make_align( + 0, + false, + vec![(120, 160, 0, 40)], + vec![Op::new(Kind::Match, 40)], + ); let results = align_to_transcripts(&align, &idx, 40); assert_eq!(results.len(), 1); let r = &results[0]; @@ -2096,6 +2115,7 @@ mod tests { #[test] fn project_multi_exon_align_onto_longer_transcript() { + use cigar::op::{Kind, Op}; let genome = make_genome(); // 3-exon transcript: [100,200) + [300,400) + [500,600). tr_length = 300. let gtf = vec![ @@ -2111,9 +2131,9 @@ mod tests { false, vec![(350, 400, 0, 50), (500, 550, 50, 100)], vec![ - CigarOp::Match(50), - CigarOp::RefSkip(100), - CigarOp::Match(50), + Op::new(Kind::Match, 50), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 50), ], ); let results = align_to_transcripts(&align, &idx, 100); @@ -2130,6 +2150,7 @@ mod tests { #[test] fn project_past_transcript_end_fails() { + use cigar::op::{Kind, Op}; let genome = make_genome(); let gtf = vec![make_exon("chr1", 101, 200, '+', "G1", "T1")]; let idx = TranscriptomeIndex::from_gtf_exons(>f, &genome).unwrap(); @@ -2139,7 +2160,7 @@ mod tests { 0, false, vec![(150, 250, 0, 100)], - vec![CigarOp::Match(100)], + vec![Op::new(Kind::Match, 100)], ); let results = align_to_transcripts(&align, &idx, 100); assert_eq!(results.len(), 0); @@ -2147,17 +2168,24 @@ mod tests { #[test] fn project_before_all_transcripts_returns_empty() { + use cigar::op::{Kind, Op}; let genome = make_genome(); let gtf = vec![make_exon("chr1", 501, 600, '+', "G1", "T1")]; let idx = TranscriptomeIndex::from_gtf_exons(>f, &genome).unwrap(); - let align = make_align(0, false, vec![(100, 150, 0, 50)], vec![CigarOp::Match(50)]); + let align = make_align( + 0, + false, + vec![(100, 150, 0, 50)], + vec![Op::new(Kind::Match, 50)], + ); let results = align_to_transcripts(&align, &idx, 50); assert_eq!(results.len(), 0); } #[test] fn project_onto_multiple_overlapping_transcripts() { + use cigar::op::{Kind, Op}; let genome = make_genome(); // Two transcripts both containing the align: // T1: [100, 400) single exon @@ -2168,7 +2196,12 @@ mod tests { ]; let idx = TranscriptomeIndex::from_gtf_exons(>f, &genome).unwrap(); - let align = make_align(0, false, vec![(200, 250, 0, 50)], vec![CigarOp::Match(50)]); + let align = make_align( + 0, + false, + vec![(200, 250, 0, 50)], + vec![Op::new(Kind::Match, 50)], + ); let results = align_to_transcripts(&align, &idx, 50); assert_eq!(results.len(), 2); } @@ -2210,10 +2243,16 @@ mod tests { #[test] fn filter_default_rejects_indels() { + use cigar::op::{Kind, Op}; let genome = make_genome(); let gtf = vec![make_exon("chr1", 101, 300, '+', "G1", "T1")]; let idx = TranscriptomeIndex::from_gtf_exons(>f, &genome).unwrap(); - let mut align = make_align(0, false, vec![(110, 150, 0, 40)], vec![CigarOp::Match(40)]); + let mut align = make_align( + 0, + false, + vec![(110, 150, 0, 40)], + vec![Op::new(Kind::Match, 40)], + ); align.n_gap = 1; // simulate insertion/deletion let params = default_params(); let read = vec![0u8; 40]; @@ -2231,10 +2270,16 @@ mod tests { #[test] fn filter_keeps_indels_when_allowed() { + use cigar::op::{Kind, Op}; let genome = make_genome(); let gtf = vec![make_exon("chr1", 101, 300, '+', "G1", "T1")]; let idx = TranscriptomeIndex::from_gtf_exons(>f, &genome).unwrap(); - let mut align = make_align(0, false, vec![(110, 150, 0, 40)], vec![CigarOp::Match(40)]); + let mut align = make_align( + 0, + false, + vec![(110, 150, 0, 40)], + vec![Op::new(Kind::Match, 40)], + ); align.n_gap = 1; let params = default_params(); let read = vec![0u8; 40]; @@ -2252,6 +2297,7 @@ mod tests { #[test] fn filter_extends_left_softclip_with_zero_mismatches() { + use cigar::op::{Kind, Op}; // Build a custom genome with known content so we can construct a // read whose soft-clipped bases match the adjacent genome. let mut seq = vec![4u8; 1000]; @@ -2276,7 +2322,7 @@ mod tests { 0, false, vec![(104, 144, 4, 44)], - vec![CigarOp::SoftClip(4), CigarOp::Match(40)], + vec![Op::new(Kind::SoftClip, 4), Op::new(Kind::Match, 40)], ); // Read is 44 bases of A (0s) — matches all of genome [100, 144). let read = vec![0u8; 44]; @@ -2295,14 +2341,15 @@ mod tests { let r = &results[0]; // CIGAR should now be a single 44M (4S folded into leading M). assert_eq!(r.cigar.len(), 1); - match r.cigar[0] { - CigarOp::Match(n) => assert_eq!(n, 44), + match r.cigar[0].kind() { + Kind::Match => assert_eq!(r.cigar[0].len(), 44), _ => panic!("expected Match(44)"), } } #[test] fn filter_extends_softclip_too_many_mismatches_rejects() { + use cigar::op::{Kind, Op}; // Left clip is 4 bases, all mismatches. n_mismatch = 0 to start. // With very tight out_filter_mismatch_nmax, the alignment is rejected. let mut seq = vec![4u8; 1000]; @@ -2325,7 +2372,7 @@ mod tests { 0, false, vec![(104, 144, 4, 44)], - vec![CigarOp::SoftClip(4), CigarOp::Match(40)], + vec![Op::new(Kind::SoftClip, 4), Op::new(Kind::Match, 40)], ); // Read clip bases are all T (3) — mismatch against genome A (0) let mut read = vec![0u8; 44]; @@ -2350,6 +2397,7 @@ mod tests { #[test] fn filter_mode_single_end_keeps_softclip_as_is() { + use cigar::op::{Kind, Op}; let genome = make_genome(); let gtf = vec![make_exon("chr1", 101, 300, '+', "G1", "T1")]; let idx = TranscriptomeIndex::from_gtf_exons(>f, &genome).unwrap(); @@ -2358,7 +2406,7 @@ mod tests { 0, false, vec![(110, 150, 4, 44)], - vec![CigarOp::SoftClip(4), CigarOp::Match(40)], + vec![Op::new(Kind::SoftClip, 4), Op::new(Kind::Match, 40)], ); let read = vec![0u8; 44]; let params = default_params(); @@ -2379,7 +2427,7 @@ mod tests { results[0] .cigar .iter() - .any(|op| matches!(op, CigarOp::SoftClip(_))) + .any(|op| op.kind() == Kind::SoftClip) ); } diff --git a/src/stats.rs b/src/stats.rs index 620c798..01c2394 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -1,10 +1,11 @@ -/// Alignment statistics tracking and reporting +//! Alignment statistics tracking and reporting use log::info; use std::path::Path; use std::sync::atomic::{AtomicU64, Ordering}; -use crate::align::transcript::{CigarOp, Transcript}; +use crate::align::transcript::Transcript; use crate::junction::encode_motif; +use noodles::sam::alignment::record::cigar; /// Reason a read could not be mapped #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -144,23 +145,25 @@ impl AlignmentStats { /// Walks the CIGAR to extract mapped bases, ins/del counts, and splice motif counts. /// Only call this for unique mappers (n_alignments == 1). pub fn record_transcript_stats(&self, transcript: &Transcript) { + use cigar::op::Kind; // Walk CIGAR for mapped bases, ins/del for op in &transcript.cigar { - match op { - CigarOp::Match(len) | CigarOp::Equal(len) | CigarOp::Diff(len) => { - self.mapped_bases.fetch_add(*len as u64, Ordering::Relaxed); + match op.kind() { + Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => { + self.mapped_bases + .fetch_add(op.len() as u64, Ordering::Relaxed); } - CigarOp::Ins(len) => { + Kind::Insertion => { self.mapped_ins_count.fetch_add(1, Ordering::Relaxed); self.mapped_ins_bases - .fetch_add(*len as u64, Ordering::Relaxed); + .fetch_add(op.len() as u64, Ordering::Relaxed); } - CigarOp::Del(len) => { + Kind::Deletion => { self.mapped_del_count.fetch_add(1, Ordering::Relaxed); self.mapped_del_bases - .fetch_add(*len as u64, Ordering::Relaxed); + .fetch_add(op.len() as u64, Ordering::Relaxed); } - CigarOp::RefSkip(_) | CigarOp::SoftClip(_) | CigarOp::HardClip(_) => {} + Kind::Skip | Kind::SoftClip | Kind::HardClip | Kind::Pad => {} } } @@ -761,6 +764,7 @@ mod tests { fn test_record_transcript_stats() { use crate::align::score::SpliceMotif; use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let stats = AlignmentStats::new(); @@ -778,14 +782,14 @@ mod tests { i_frag: 0, }], cigar: vec![ - CigarOp::SoftClip(5), - CigarOp::Match(45), - CigarOp::Ins(3), - CigarOp::Match(2), - CigarOp::RefSkip(100), - CigarOp::Match(50), - CigarOp::Del(2), - CigarOp::Match(5), + Op::new(Kind::SoftClip, 5), + Op::new(Kind::Match, 45), + Op::new(Kind::Insertion, 3), + Op::new(Kind::Match, 2), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 50), + Op::new(Kind::Deletion, 2), + Op::new(Kind::Match, 5), ], score: 100, n_mismatch: 3, @@ -832,6 +836,7 @@ mod tests { fn test_splice_motif_aggregation() { use crate::align::score::SpliceMotif; use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let stats = AlignmentStats::new(); @@ -848,7 +853,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, From 03165a2e2c2aae4c974dd59094163c5b485622e0 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Sat, 16 May 2026 01:03:33 +0200 Subject: [PATCH 3/4] simplify stitching --- src/align/stitch.rs | 48 ++++++++++++-------------------------- src/align/transcript.rs | 9 +++++++ src/quant/transcriptome.rs | 10 ++++---- 3 files changed, 28 insertions(+), 39 deletions(-) diff --git a/src/align/stitch.rs b/src/align/stitch.rs index c504964..66f6273 100644 --- a/src/align/stitch.rs +++ b/src/align/stitch.rs @@ -1,7 +1,7 @@ //! Seed clustering and stitching via dynamic programming use crate::align::score::AlignmentScorer; use crate::align::seed::Seed; -use crate::align::transcript::Transcript; +use crate::align::transcript::{CigarOpExt, Transcript}; use crate::error::Error; use crate::index::GenomeIndex; use noodles::sam::alignment::record::cigar; @@ -1754,6 +1754,16 @@ pub(crate) fn finalize_transcript( use cigar::op::{Kind, Op}; + fn append_match(ops: &mut Vec, len: usize) { + if let Some(op) = ops.last_mut() + && op.kind() == Kind::Match + { + *op = op.with_added_len(len); + } else { + ops.push(Op::new(Kind::Match, len)); + } + } + // Build final CIGAR from exon blocks let mut final_cigar: Vec = Vec::new(); @@ -1779,14 +1789,7 @@ pub(crate) fn finalize_transcript( // Shared match bases before the gap let shared = read_gap.max(0) as usize; if shared > 0 { - // TODO: replace with “extend_or_push_match” function call - if let Some(op) = final_cigar.last_mut() - && op.kind() == Kind::Match - { - *op = Op::new(Kind::Match, op.len() + shared); - } else { - final_cigar.push(Op::new(Kind::Match, shared)); - } + append_match(&mut final_cigar, shared); } let del = (genome_gap - read_gap.max(0)) as usize; if del >= scorer.align_intron_min as usize @@ -1800,14 +1803,7 @@ pub(crate) fn finalize_transcript( // Insertion let shared = genome_gap.max(0) as usize; if shared > 0 { - // TODO: replace with “extend_or_push_match” function call - if let Some(op) = final_cigar.last_mut() - && op.kind() == Kind::Match - { - *op = Op::new(Kind::Match, op.len() + shared); - } else { - final_cigar.push(Op::new(Kind::Match, shared)); - } + append_match(&mut final_cigar, shared); } let ins = (read_gap - genome_gap.max(0)) as usize; final_cigar.push(Op::new(Kind::Insertion, ins)); @@ -1818,27 +1814,13 @@ pub(crate) fn finalize_transcript( // This exon's match region let match_len = exon.read_end - exon.read_start; if match_len > 0 { - // TODO: replace with “extend_or_push_match” function call - if let Some(op) = final_cigar.last_mut() - && op.kind() == Kind::Match - { - *op = Op::new(Kind::Match, op.len() + match_len); - } else { - final_cigar.push(Op::new(Kind::Match, match_len)); - } + append_match(&mut final_cigar, match_len); } } // Right extension match if right_extend.extend_len > 0 { - // TODO: replace with “extend_or_push_match” function call - if let Some(op) = final_cigar.last_mut() - && op.kind() == Kind::Match - { - *op = Op::new(Kind::Match, op.len() + right_extend.extend_len); - } else { - final_cigar.push(Op::new(Kind::Match, right_extend.extend_len)); - } + append_match(&mut final_cigar, right_extend.extend_len); } // Right soft clip diff --git a/src/align/transcript.rs b/src/align/transcript.rs index ea53c3e..ca91ff5 100644 --- a/src/align/transcript.rs +++ b/src/align/transcript.rs @@ -55,6 +55,15 @@ pub struct Exon { pub i_frag: u8, } +pub(crate) trait CigarOpExt { + fn with_added_len(&self, len: usize) -> Self; +} +impl CigarOpExt for cigar::Op { + fn with_added_len(&self, len: usize) -> Self { + cigar::Op::new(self.kind(), self.len() + len) + } +} + fn cigar_char(kind: cigar::op::Kind) -> char { use cigar::op::Kind; match kind { diff --git a/src/quant/transcriptome.rs b/src/quant/transcriptome.rs index 27dfc1f..3c8e8a7 100644 --- a/src/quant/transcriptome.rs +++ b/src/quant/transcriptome.rs @@ -10,7 +10,7 @@ //! only substantive divergence that rustar-aligner builds the transcript tables on //! the fly from the input GTF instead of loading persisted //! `transcriptInfo.tab`/`exonInfo.tab` files. -use crate::align::transcript::{Exon, Transcript}; +use crate::align::transcript::{CigarOpExt, Exon, Transcript}; use crate::error::Error; use crate::genome::Genome; use crate::junction::gtf::GtfRecord; @@ -1301,8 +1301,7 @@ fn rebuild_cigar_without_softclips( if i == 0 && left_clip > 0 { // Fold left_clip into the first op if it's match-like. match op.kind() { - Kind::Match => out.push(Op::new(Kind::Match, op.len() + left_clip)), - Kind::SequenceMatch => out.push(Op::new(Kind::SequenceMatch, op.len() + left_clip)), + Kind::Match | Kind::SequenceMatch => out.push(op.with_added_len(left_clip)), _ => { // Extension landed on a non-match op (shouldn't normally // happen). Emit as Match. @@ -1312,9 +1311,8 @@ fn rebuild_cigar_without_softclips( } } else if i + 1 == body.len() && right_clip > 0 { match op.kind() { - Kind::Match => out.push(Op::new(Kind::Match, op.len() + right_clip)), - Kind::SequenceMatch => { - out.push(Op::new(Kind::SequenceMatch, op.len() + right_clip)); + Kind::Match | Kind::SequenceMatch => { + out.push(op.with_added_len(right_clip)); } _ => { out.push(*op); From 964566cac2474c6fd8d2163f16016070e7efc13e Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Sat, 16 May 2026 01:10:29 +0200 Subject: [PATCH 4/4] missed one --- src/io/sam.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/io/sam.rs b/src/io/sam.rs index 19ec856..9ae7611 100644 --- a/src/io/sam.rs +++ b/src/io/sam.rs @@ -1,6 +1,6 @@ //! SAM/BAM output writer with noodles use crate::align::read_align::PairedAlignment; -use crate::align::transcript::Transcript; +use crate::align::transcript::{Transcript, cigar_to_string}; use crate::error::Error; use crate::genome::Genome; use crate::io::fastq::{complement_base, decode_base}; @@ -150,17 +150,13 @@ impl SamWriter { .name() .map(|n| String::from_utf8_lossy(n.as_ref()).to_string()) .unwrap_or_default(); - let cigar_str = cigar_ops.iter().fold(String::new(), |mut c, op| { - let _ = write!(c, "{}{:?}", op.len(), op.kind()); // infallible - c - }); panic!( "[SAM-MISMATCH] read={} cigar_query_len={} seq_len={} flags={:?} cigar={}", name, cigar_query_len, seq_len, record.flags(), - cigar_str + cigar_to_string(cigar_ops) ); } self.writer.write_alignment_record(&self.header, record)?;