diff --git a/src/align/mod.rs b/src/align/mod.rs index 34c9cd5..1679ced 100644 --- a/src/align/mod.rs +++ b/src/align/mod.rs @@ -10,4 +10,4 @@ pub use read_align::{ }; pub use seed::Seed; pub use stitch::{SeedCluster, WindowAlignment, stitch_seeds}; -pub use transcript::{CigarOp, Exon, Transcript}; +pub use transcript::{Exon, Transcript}; diff --git a/src/align/read_align.rs b/src/align/read_align.rs index 91321ad..76783c3 100644 --- a/src/align/read_align.rs +++ b/src/align/read_align.rs @@ -5,12 +5,14 @@ use crate::align::stitch::{ PE_SPACER_BASE, cluster_seeds, finalize_transcript, split_combined_wt, stitch_seeds_core, stitch_seeds_with_jdb_debug, }; -use crate::align::transcript::{Exon, Transcript}; +use crate::align::transcript::{Exon, KindExt as _, Transcript}; use crate::error::Error; use crate::index::GenomeIndex; use crate::params::{IntronMotifFilter, IntronStrandFilter, Parameters}; use crate::stats::UnmappedReason; +use noodles::sam::alignment::record::cigar; use rand::{SeedableRng, rngs::StdRng, seq::SliceRandom}; +use std::cmp::Ordering; use std::hash::{DefaultHasher, Hash, Hasher}; /// Derive a deterministic per-read RNG seed from `run_rng_seed` + the read name. @@ -323,7 +325,6 @@ pub fn align_read( } else { "unknown" }; - let cigar_str: String = t.cigar.iter().map(|op| format!("{}", op)).collect(); eprintln!( " transcript[{}]: chr={}:{}-{} ({}) score={} mm={} junctions={} cigar={}", ti, @@ -334,7 +335,7 @@ pub fn align_read( t.score, t.n_mismatch, t.n_junction, - cigar_str + t.cigar_string() ); } } @@ -351,20 +352,9 @@ pub fn align_read( // Deduplicate transcripts with identical genomic coordinates AND CIGAR. transcripts.sort_by(|a, b| { - ( - a.chr_idx, - a.genome_start, - a.genome_end, - a.is_reverse, - &a.cigar, - ) - .cmp(&( - b.chr_idx, - b.genome_start, - b.genome_end, - b.is_reverse, - &b.cigar, - )) + (a.chr_idx, a.genome_start, a.genome_end, a.is_reverse) + .cmp(&(b.chr_idx, b.genome_start, b.genome_end, b.is_reverse)) + .then_with(|| cmp_cigar(&a.cigar, &b.cigar)) .then_with(|| b.score.cmp(&a.score)) }); transcripts.dedup_by(|a, b| { @@ -467,13 +457,13 @@ pub fn align_read( // Absolute matched bases let n_matched = t.n_matched(); - if n_matched < params.out_filter_match_nmin { + if n_matched < params.out_filter_match_nmin as usize { *filter_reasons.entry("match_min").or_insert(0) += 1; return false; } // Relative matched bases: STAR casts to uint (u32) - if n_matched < (params.out_filter_match_nmin_over_lread * lread_m1) as u32 { + if (n_matched as f64) < params.out_filter_match_nmin_over_lread * lread_m1 { *filter_reasons.entry("match_min_relative").or_insert(0) += 1; return false; } @@ -519,7 +509,6 @@ pub fn align_read( match motif.implied_strand() { Some('+') => has_plus = true, Some('-') => has_minus = true, - None => {} _ => {} } } @@ -633,7 +622,6 @@ pub fn align_read( } else { "unknown" }; - let cigar_str: String = t.cigar.iter().map(|op| format!("{}", op)).collect(); eprintln!( " FINAL[{}]: chr={}:{}-{} ({}) score={} mm={} junctions={} cigar={}", i, @@ -644,7 +632,7 @@ pub fn align_read( t.score, t.n_mismatch, t.n_junction, - cigar_str + t.cigar_string() ); } } @@ -989,8 +977,8 @@ pub fn align_paired_read( } b.combined_wt_score .cmp(&a.combined_wt_score) - .then_with(|| a.mate1_transcript.cigar.cmp(&b.mate1_transcript.cigar)) - .then_with(|| a.mate2_transcript.cigar.cmp(&b.mate2_transcript.cigar)) + .then_with(|| cmp_cigar(&a.mate1_transcript.cigar, &b.mate1_transcript.cigar)) + .then_with(|| cmp_cigar(&a.mate2_transcript.cigar, &b.mate2_transcript.cigar)) }); joint_pairs.dedup_by(|a, b| { a.mate1_transcript.chr_idx == b.mate1_transcript.chr_idx @@ -1234,6 +1222,18 @@ pub fn align_paired_read( } } +/// Comparator for deduplicating structs containing CIGAR ops via sort→dedup +fn cmp_cigar(a: &[cigar::Op], b: &[cigar::Op]) -> Ordering { + a.len().cmp(&b.len()).then_with(|| { + (a.iter().zip(b.iter())) + .find_map(|(a, b)| { + let ord = (a.char(), a.len()).cmp(&(b.char(), b.len())); + (ord != Ordering::Equal).then_some(ord) + }) + .unwrap_or(Ordering::Equal) + }) +} + /// Attempt to pair two per-mate transcripts into a PairedAlignment. /// /// Returns `None` if the mates are incompatible (same strand, different chr, too far, etc.). @@ -1416,18 +1416,18 @@ fn extract_junctions_from_cigar(t: &Transcript) -> Vec<(u64, u64)> { let mut junctions = Vec::new(); let mut genome_pos = t.genome_start; for op in &t.cigar { - use crate::align::transcript::CigarOp; - match op { - CigarOp::Match(n) | CigarOp::Equal(n) | CigarOp::Diff(n) | CigarOp::Del(n) => { - genome_pos += *n as u64; + use noodles::sam::alignment::record::cigar::op::Kind; + match op.kind() { + Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch | Kind::Deletion => { + genome_pos += op.len() as u64; } - CigarOp::RefSkip(n) => { + Kind::Skip => { let donor = genome_pos; - let acceptor = genome_pos + *n as u64; + let acceptor = genome_pos + op.len() as u64; junctions.push((donor, acceptor)); genome_pos = acceptor; } - CigarOp::Ins(_) | CigarOp::SoftClip(_) | CigarOp::HardClip(_) => {} + Kind::Insertion | Kind::SoftClip | Kind::HardClip | Kind::Pad => {} } } junctions @@ -1479,6 +1479,7 @@ mod tests { use crate::index::sa_index::SaIndex; use crate::index::suffix_array::SuffixArray; use clap::Parser; + use noodles::sam::alignment::record::cigar; fn make_test_params() -> Parameters { // Parse empty args to get default parameters @@ -1537,8 +1538,7 @@ mod tests { #[test] fn combined_transcript_for_projection_rewrites_mate2_ifrag() { - use crate::align::transcript::CigarOp; - + use cigar::op::{Kind, Op}; let make_tr = |gs: u64, ge: u64, rs: usize, re: usize| Transcript { chr_idx: 0, genome_start: gs, @@ -1551,7 +1551,7 @@ mod tests { read_end: re, i_frag: 0, }], - cigar: vec![CigarOp::Match((ge - gs) as u32)], + cigar: vec![Op::new(Kind::Match, (ge - gs) as usize)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1654,7 +1654,8 @@ mod tests { #[test] fn test_check_proper_pair_distance() { - use crate::align::transcript::{CigarOp, Exon}; + use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let params = make_test_params(); @@ -1671,7 +1672,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1693,7 +1694,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1709,7 +1710,8 @@ mod tests { #[test] fn test_check_proper_pair_too_far() { - use crate::align::transcript::{CigarOp, Exon}; + use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let mut params = make_test_params(); params.align_mates_gap_max = 100; @@ -1726,7 +1728,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1748,7 +1750,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1764,7 +1766,8 @@ mod tests { #[test] fn test_calculate_insert_size_positive() { - use crate::align::transcript::{CigarOp, Exon}; + use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; // Mate1 is leftmost let t1 = Transcript { @@ -1779,7 +1782,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1801,7 +1804,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1817,8 +1820,9 @@ mod tests { #[test] fn test_strand_consistency_filter() { - use crate::align::transcript::{CigarOp, Exon, Transcript}; + use crate::align::transcript::{Exon, Transcript}; use crate::params::IntronStrandFilter; + use cigar::op::{Kind, Op}; // Create a transcript with conflicting strand motifs (mixed + and - within one transcript) let t_inconsistent = Transcript { @@ -1833,7 +1837,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1856,7 +1860,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1917,7 +1921,8 @@ mod tests { #[test] fn test_calculate_insert_size_negative() { - use crate::align::transcript::{CigarOp, Exon}; + use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; // RF pair: mate1 reverse (right side), mate2 forward (left side). // STAR reverse cluster: tlen = mate1.genome_end - mate2.genome_start = 1300 - 1000 = 300. @@ -1934,7 +1939,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1956,7 +1961,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1973,7 +1978,8 @@ mod tests { #[test] fn test_noncanonical_unannotated_filter() { use crate::align::score::SpliceMotif; - use crate::align::transcript::{CigarOp, Exon, Transcript}; + use crate::align::transcript::{Exon, Transcript}; + use cigar::op::{Kind, Op}; // Helper: check if a transcript would be filtered by RemoveNoncanonicalUnannotated // (mirrors the logic in the retain closure) @@ -1996,7 +2002,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, @@ -2067,7 +2073,8 @@ mod tests { #[test] fn test_paired_alignment_result_enum_variants() { - use crate::align::transcript::{CigarOp, Exon}; + use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, @@ -2081,7 +2088,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0, diff --git a/src/align/stitch.rs b/src/align/stitch.rs index 4d4e81b..c9b27b4 100644 --- a/src/align/stitch.rs +++ b/src/align/stitch.rs @@ -1,9 +1,10 @@ -/// Seed clustering and stitching via dynamic programming +//! Seed clustering and stitching via dynamic programming use crate::align::score::AlignmentScorer; use crate::align::seed::Seed; -use crate::align::transcript::{CigarOp, Transcript}; +use crate::align::transcript::{CigarOpExt as _, Transcript}; use crate::error::Error; use crate::index::GenomeIndex; +use noodles::sam::alignment::record::cigar; /// STAR's MARK_FRAG_SPACER_BASE (IncludeDefine.h:174). /// Separates mate1 and mate2 fragments in the combined PE read. @@ -104,12 +105,13 @@ fn score_region( fn count_mismatches( read_seq: &[u8], - cigar_ops: &[CigarOp], + cigar_ops: &[cigar::Op], genome_start: u64, read_start: usize, index: &GenomeIndex, is_reverse: bool, ) -> u32 { + use cigar::op::Kind; // Add n_genome offset for reverse-strand genome access let genome_offset = if is_reverse { index.genome.n_genome } else { 0 }; @@ -118,9 +120,9 @@ fn count_mismatches( let mut genome_pos = genome_start; for op in cigar_ops { - match op { - CigarOp::Match(len) | CigarOp::Equal(len) | CigarOp::Diff(len) => { - for _i in 0..*len { + match op.kind() { + Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => { + for _i in 0..op.len() { if read_pos < read_seq.len() { let read_base = read_seq[read_pos]; if let Some(genome_base) = index.genome.get_base(genome_pos + genome_offset) @@ -135,16 +137,13 @@ fn count_mismatches( genome_pos += 1; } } - CigarOp::Ins(len) => { - read_pos += *len as usize; + Kind::Insertion | Kind::SoftClip => { + read_pos += op.len(); } - CigarOp::Del(len) | CigarOp::RefSkip(len) => { - genome_pos += *len as u64; + Kind::Deletion | Kind::Skip => { + genome_pos += op.len() as u64; } - CigarOp::SoftClip(len) => { - read_pos += *len as usize; - } - CigarOp::HardClip(_) => {} + Kind::HardClip | Kind::Pad => {} } } @@ -1753,18 +1752,30 @@ pub(crate) fn finalize_transcript( } } + use cigar::op::{Kind, Op}; + + fn append_match(ops: &mut Vec, len: usize) { + if let Some(op) = ops.last_mut() + && op.kind() == Kind::Match + { + *op = op.add_len(len); + } else { + ops.push(Op::new(Kind::Match, len)); + } + } + // Build final CIGAR from exon blocks - let mut final_cigar: Vec = Vec::new(); + let mut final_cigar: Vec = Vec::new(); // Left soft clip let remaining_left_clip = alignment_start - left_extend.extend_len; if remaining_left_clip > 0 { - final_cigar.push(CigarOp::SoftClip(remaining_left_clip as u32)); + final_cigar.push(Op::new(Kind::SoftClip, remaining_left_clip)); } // Left extension match if left_extend.extend_len > 0 { - final_cigar.push(CigarOp::Match(left_extend.extend_len as u32)); + final_cigar.push(Op::new(Kind::Match, left_extend.extend_len)); } // Walk exon blocks to build CIGAR @@ -1776,69 +1787,55 @@ pub(crate) fn finalize_transcript( if genome_gap > read_gap && genome_gap > 0 { // Shared match bases before the gap - let shared = read_gap.max(0) as u32; + let shared = read_gap.max(0) as usize; if shared > 0 { - if let Some(CigarOp::Match(prev_len)) = final_cigar.last_mut() { - *prev_len += shared; - } else { - final_cigar.push(CigarOp::Match(shared)); - } + append_match(&mut final_cigar, shared); } - let del = (genome_gap - read_gap.max(0)) as u32; - if del >= scorer.align_intron_min && del <= scorer.align_intron_max { - final_cigar.push(CigarOp::RefSkip(del)); + let del = (genome_gap - read_gap.max(0)) as usize; + if del >= scorer.align_intron_min as usize + && del <= scorer.align_intron_max as usize + { + final_cigar.push(Op::new(Kind::Skip, del)); } else { - final_cigar.push(CigarOp::Del(del)); + final_cigar.push(Op::new(Kind::Deletion, del)); } } else if read_gap > genome_gap && read_gap > 0 { // Insertion - let shared = genome_gap.max(0) as u32; + let shared = genome_gap.max(0) as usize; if shared > 0 { - if let Some(CigarOp::Match(prev_len)) = final_cigar.last_mut() { - *prev_len += shared; - } else { - final_cigar.push(CigarOp::Match(shared)); - } + append_match(&mut final_cigar, shared); } - let ins = (read_gap - genome_gap.max(0)) as u32; - final_cigar.push(CigarOp::Ins(ins)); + let ins = (read_gap - genome_gap.max(0)) as usize; + final_cigar.push(Op::new(Kind::Insertion, ins)); } // Equal gap case is handled by extended exon blocks in stitch_align_to_transcript } // This exon's match region - let match_len = (exon.read_end - exon.read_start) as u32; + let match_len = exon.read_end - exon.read_start; if match_len > 0 { - if let Some(CigarOp::Match(prev_len)) = final_cigar.last_mut() { - *prev_len += match_len; - } else { - final_cigar.push(CigarOp::Match(match_len)); - } + append_match(&mut final_cigar, match_len); } } // Right extension match if right_extend.extend_len > 0 { - if let Some(CigarOp::Match(prev_len)) = final_cigar.last_mut() { - *prev_len += right_extend.extend_len as u32; - } else { - final_cigar.push(CigarOp::Match(right_extend.extend_len as u32)); - } + append_match(&mut final_cigar, right_extend.extend_len); } // Right soft clip let remaining_right_clip = (read_seq.len() - alignment_end) - right_extend.extend_len; if remaining_right_clip > 0 { - final_cigar.push(CigarOp::SoftClip(remaining_right_clip as u32)); + final_cigar.push(Op::new(Kind::SoftClip, remaining_right_clip)); } // Validate CIGAR read-consuming length - let cigar_read_len: u32 = final_cigar + let cigar_read_len: usize = final_cigar .iter() - .filter(|op| op.consumes_query()) + .filter(|op| op.kind().consumes_read()) .map(|op| op.len()) .sum(); - if cigar_read_len != read_seq.len() as u32 { + if cigar_read_len != read_seq.len() { // Invalid CIGAR: exon block geometry is inconsistent with read length. // This can occur when a combined PE read's WorkingTranscript has exon // positions that span both mates or include the spacer region. Silently @@ -1870,13 +1867,13 @@ pub(crate) fn finalize_transcript( // Compute total reference-consuming length from CIGAR let mut ref_len = 0u64; for op in &final_cigar { - match op { - CigarOp::Match(len) - | CigarOp::Equal(len) - | CigarOp::Diff(len) - | CigarOp::Del(len) - | CigarOp::RefSkip(len) => { - ref_len += *len as u64; + match op.kind() { + Kind::Match + | Kind::SequenceMatch + | Kind::SequenceMismatch + | Kind::Deletion + | Kind::Skip => { + ref_len += op.len() as u64; } _ => {} } @@ -1898,9 +1895,9 @@ pub(crate) fn finalize_transcript( let mut genome_pos_e = forward_genome_start; for op in &final_cigar { - match op { - CigarOp::Match(len) | CigarOp::Equal(len) | CigarOp::Diff(len) => { - let len = *len as usize; + match op.kind() { + Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => { + let len = op.len(); exons.push(Exon { genome_start: genome_pos_e, genome_end: genome_pos_e + len as u64, @@ -1913,19 +1910,13 @@ pub(crate) fn finalize_transcript( read_pos_e += len; genome_pos_e += len as u64; } - CigarOp::Ins(len) => { - read_pos_e += *len as usize; - } - CigarOp::Del(len) => { - genome_pos_e += *len as u64; - } - CigarOp::RefSkip(len) => { - genome_pos_e += *len as u64; + Kind::Insertion | Kind::SoftClip => { + read_pos_e += op.len(); } - CigarOp::SoftClip(len) => { - read_pos_e += *len as usize; + Kind::Deletion | Kind::Skip => { + genome_pos_e += op.len() as u64; } - CigarOp::HardClip(_) => {} + Kind::HardClip | Kind::Pad => {} } } diff --git a/src/align/transcript.rs b/src/align/transcript.rs index efc8de7..b611f52 100644 --- a/src/align/transcript.rs +++ b/src/align/transcript.rs @@ -1,5 +1,6 @@ -/// Transcript data structures for storing alignment results -use std::fmt; +//! Transcript data structures for storing alignment results +use noodles::sam::alignment::record::cigar; +use std::fmt::Write as _; /// A complete alignment of a read to the genome #[derive(Debug, Clone)] @@ -15,7 +16,7 @@ pub struct Transcript { /// Exon segments pub exons: Vec, /// CIGAR operations - pub cigar: Vec, + pub cigar: Vec, /// Alignment score pub score: i32, /// Number of mismatches @@ -54,145 +55,92 @@ pub struct Exon { pub i_frag: u8, } -/// CIGAR operation -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -pub enum CigarOp { - /// M: match/mismatch (default mode) - Match(u32), - /// =: exact match (optional) - Equal(u32), - /// X: mismatch (optional) - Diff(u32), - /// I: insertion to reference - Ins(u32), - /// D: deletion from reference - Del(u32), - /// N: splice junction (skipped reference region) - RefSkip(u32), - /// S: soft clip (clipped sequence present in read) - SoftClip(u32), - /// H: hard clip (clipped sequence not present) - HardClip(u32), +pub(crate) trait CigarOpExt { + fn add_len(&self, len: usize) -> Self; } - -impl CigarOp { - /// Get the operation character - pub fn op_char(&self) -> char { - match self { - CigarOp::Match(_) => 'M', - CigarOp::Equal(_) => '=', - CigarOp::Diff(_) => 'X', - CigarOp::Ins(_) => 'I', - CigarOp::Del(_) => 'D', - CigarOp::RefSkip(_) => 'N', - CigarOp::SoftClip(_) => 'S', - CigarOp::HardClip(_) => 'H', - } +impl CigarOpExt for cigar::Op { + fn add_len(&self, len: usize) -> Self { + cigar::Op::new(self.kind(), self.len() + len) } +} - /// Get the operation length - pub fn len(&self) -> u32 { +pub(crate) trait KindExt { + fn char(self) -> char; +} +impl KindExt for cigar::op::Kind { + fn char(self) -> char { + use cigar::op::Kind; match self { - CigarOp::Match(n) - | CigarOp::Equal(n) - | CigarOp::Diff(n) - | CigarOp::Ins(n) - | CigarOp::Del(n) - | CigarOp::RefSkip(n) - | CigarOp::SoftClip(n) - | CigarOp::HardClip(n) => *n, + Kind::Match => 'M', + Kind::Insertion => 'I', + Kind::Deletion => 'D', + Kind::Skip => 'N', + Kind::SoftClip => 'S', + Kind::HardClip => 'H', + Kind::Pad => 'P', + Kind::SequenceMatch => '=', + Kind::SequenceMismatch => 'X', } } - - /// Check if operation is empty - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Check if operation consumes query bases - pub fn consumes_query(&self) -> bool { - matches!( - self, - CigarOp::Match(_) - | CigarOp::Equal(_) - | CigarOp::Diff(_) - | CigarOp::Ins(_) - | CigarOp::SoftClip(_) - ) - } - - /// Check if operation consumes reference bases - pub fn consumes_reference(&self) -> bool { - matches!( - self, - CigarOp::Match(_) - | CigarOp::Equal(_) - | CigarOp::Diff(_) - | CigarOp::Del(_) - | CigarOp::RefSkip(_) - ) +} +impl KindExt for cigar::Op { + fn char(self) -> char { + self.kind().char() } } -impl fmt::Display for CigarOp { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}{}", self.len(), self.op_char()) - } +/// Convert CIGAR operations to CIGAR string +pub(crate) fn cigar_to_string(cigar: &[cigar::Op]) -> String { + cigar.iter().fold(String::new(), |mut c, op| { + let _ = write!(c, "{}{}", op.len(), op.char()); // infallible + c + }) } impl Transcript { /// Format CIGAR string pub fn cigar_string(&self) -> String { - self.cigar.iter().map(|op| op.to_string()).collect() + cigar_to_string(&self.cigar) } /// Calculate number of matched bases (for filtering) - pub fn n_matched(&self) -> u32 { + pub fn n_matched(&self) -> usize { + use cigar::op::Kind; self.cigar .iter() - .filter_map(|op| match op { - CigarOp::Match(n) | CigarOp::Equal(n) => Some(*n), + .filter_map(|op| match op.kind() { + Kind::Match | Kind::SequenceMatch => Some(op.len()), _ => None, }) .sum() } /// Calculate read length from CIGAR - pub fn read_length(&self) -> u32 { + pub fn read_length(&self) -> usize { self.cigar .iter() - .filter(|op| op.consumes_query()) + .filter(|op| op.kind().consumes_read()) .map(|op| op.len()) .sum() } /// Calculate reference length from CIGAR - pub fn reference_length(&self) -> u32 { + pub fn reference_length(&self) -> usize { self.cigar .iter() - .filter(|op| op.consumes_reference()) + .filter(|op| op.kind().consumes_reference()) .map(|op| op.len()) .sum() } /// Count soft-clipped bases on left and right ends /// - /// Returns (left_clip, right_clip) in bases - pub fn count_soft_clips(&self) -> (u32, u32) { - let mut left_clip = 0u32; - let mut right_clip = 0u32; - - // Check first operation for left clip - if let Some(CigarOp::SoftClip(n)) = self.cigar.first() { - left_clip = *n; - } - - // Check last operation for right clip - if let Some(CigarOp::SoftClip(n)) = self.cigar.last() { - right_clip = *n; - } - - (left_clip, right_clip) + /// Returns `[left_clip, right_clip]` in bases + pub fn count_soft_clips(&self) -> [usize; 2] { + [self.cigar.first(), self.cigar.last()].map(|c| { + c.filter(|c| c.kind() == cigar::op::Kind::SoftClip) + .map_or(0, |c| c.len()) + }) } } @@ -201,16 +149,22 @@ mod tests { use super::*; #[test] - fn test_cigar_op_display() { - assert_eq!(CigarOp::Match(50).to_string(), "50M"); - assert_eq!(CigarOp::Ins(3).to_string(), "3I"); - assert_eq!(CigarOp::Del(2).to_string(), "2D"); - assert_eq!(CigarOp::RefSkip(1000).to_string(), "1000N"); - assert_eq!(CigarOp::SoftClip(5).to_string(), "5S"); + fn test_cigar_to_string() { + use cigar::op::{Kind, Op}; + let cigar = vec![ + Op::new(Kind::Match, 50), + Op::new(Kind::Insertion, 2), + Op::new(Kind::Deletion, 3), + Op::new(Kind::Skip, 1000), + Op::new(Kind::SoftClip, 5), + ]; + + assert_eq!(cigar_to_string(&cigar), "50M2I3D1000N5S"); } #[test] fn test_cigar_string() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 100, @@ -218,9 +172,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(50), - CigarOp::RefSkip(100), - CigarOp::Match(50), + Op::new(Kind::Match, 50), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 50), ], score: 100, n_mismatch: 0, @@ -236,24 +190,27 @@ mod tests { #[test] fn test_cigar_consumes() { - assert!(CigarOp::Match(10).consumes_query()); - assert!(CigarOp::Match(10).consumes_reference()); + use cigar::op::Kind; + + assert!(Kind::Match.consumes_read()); + assert!(Kind::Match.consumes_reference()); - assert!(CigarOp::Ins(5).consumes_query()); - assert!(!CigarOp::Ins(5).consumes_reference()); + assert!(Kind::Insertion.consumes_read()); + assert!(!Kind::Insertion.consumes_reference()); - assert!(!CigarOp::Del(3).consumes_query()); - assert!(CigarOp::Del(3).consumes_reference()); + assert!(!Kind::Deletion.consumes_read()); + assert!(Kind::Deletion.consumes_reference()); - assert!(!CigarOp::RefSkip(1000).consumes_query()); - assert!(CigarOp::RefSkip(1000).consumes_reference()); + assert!(!Kind::Skip.consumes_read()); + assert!(Kind::Skip.consumes_reference()); - assert!(CigarOp::SoftClip(5).consumes_query()); - assert!(!CigarOp::SoftClip(5).consumes_reference()); + assert!(Kind::SoftClip.consumes_read()); + assert!(!Kind::SoftClip.consumes_reference()); } #[test] fn test_transcript_lengths() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 100, @@ -261,11 +218,11 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(45), - CigarOp::Ins(3), - CigarOp::Match(2), - CigarOp::RefSkip(100), - CigarOp::Match(50), + Op::new(Kind::Match, 45), + Op::new(Kind::Insertion, 3), + Op::new(Kind::Match, 2), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 50), ], score: 100, n_mismatch: 0, @@ -304,6 +261,7 @@ mod tests { #[test] fn test_count_soft_clips_both_ends() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 100, @@ -311,9 +269,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::SoftClip(10), - CigarOp::Match(50), - CigarOp::SoftClip(15), + Op::new(Kind::SoftClip, 10), + Op::new(Kind::Match, 50), + Op::new(Kind::SoftClip, 15), ], score: 100, n_mismatch: 0, @@ -324,20 +282,21 @@ mod tests { read_seq: vec![], }; - let (left, right) = transcript.count_soft_clips(); + let [left, right] = transcript.count_soft_clips(); assert_eq!(left, 10); assert_eq!(right, 15); } #[test] fn test_count_soft_clips_left_only() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 100, genome_end: 150, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::SoftClip(20), CigarOp::Match(50)], + cigar: vec![Op::new(Kind::SoftClip, 20), Op::new(Kind::Match, 50)], score: 100, n_mismatch: 0, n_gap: 0, @@ -347,20 +306,21 @@ mod tests { read_seq: vec![], }; - let (left, right) = transcript.count_soft_clips(); + let [left, right] = transcript.count_soft_clips(); assert_eq!(left, 20); assert_eq!(right, 0); } #[test] fn test_count_soft_clips_none() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 100, genome_end: 150, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 100, n_mismatch: 0, n_gap: 0, @@ -370,7 +330,7 @@ mod tests { read_seq: vec![], }; - let (left, right) = transcript.count_soft_clips(); + let [left, right] = transcript.count_soft_clips(); assert_eq!(left, 0); assert_eq!(right, 0); } diff --git a/src/chimeric/detect.rs b/src/chimeric/detect.rs index 7127ba5..034750e 100644 --- a/src/chimeric/detect.rs +++ b/src/chimeric/detect.rs @@ -43,7 +43,7 @@ impl<'a> ChimericDetector<'a> { } let read_len = read_seq.len(); - let (left_clip, right_clip) = transcript.count_soft_clips(); + let [left_clip, right_clip] = transcript.count_soft_clips(); let score_min = params.chim_score_min; let score_drop_max = params.chim_score_drop_max; let non_gtag_penalty = params.chim_score_junction_non_gtag; @@ -51,7 +51,7 @@ impl<'a> ChimericDetector<'a> { let overhang_min = params.chim_junction_overhang_min as usize; // Try right clip first, then left (match STAR's ordering) - let candidates = [(right_clip as usize, true), (left_clip as usize, false)]; + let candidates = [(right_clip, true), (left_clip, false)]; for (clip_len, is_right) in candidates { if clip_len < min_seg { @@ -980,7 +980,7 @@ pub(crate) fn transcript_to_segment(transcript: &Transcript) -> Result Transcript { + use cigar::op::{Kind, Op}; let read_len = (genome_end - genome_start) as usize; Transcript { chr_idx, @@ -1048,7 +1050,7 @@ mod tests { read_end: read_len, i_frag: 0, }], - cigar: vec![CigarOp::Match(read_len as u32)], + cigar: vec![Op::new(Kind::Match, read_len)], score: read_len as i32, n_mismatch: 0, n_gap: 0, @@ -1290,14 +1292,15 @@ mod tests { left_clip: usize, right_clip: usize, ) -> Transcript { + use cigar::op::{Kind, Op}; let aligned_len = read_len - left_clip - right_clip; let mut cigar = vec![]; if left_clip > 0 { - cigar.push(CigarOp::SoftClip(left_clip as u32)); + cigar.push(Op::new(Kind::SoftClip, left_clip)); } - cigar.push(CigarOp::Match(aligned_len as u32)); + cigar.push(Op::new(Kind::Match, aligned_len)); if right_clip > 0 { - cigar.push(CigarOp::SoftClip(right_clip as u32)); + cigar.push(Op::new(Kind::SoftClip, right_clip)); } Transcript { chr_idx, diff --git a/src/chimeric/output.rs b/src/chimeric/output.rs index dade145..f7afce5 100644 --- a/src/chimeric/output.rs +++ b/src/chimeric/output.rs @@ -78,8 +78,8 @@ impl ChimericJunctionWriter { let acceptor_start = alignment.acceptor.genome_start + 1; // Convert CIGAR to string - let donor_cigar = cigar_to_string(&alignment.donor.cigar); - let acceptor_cigar = cigar_to_string(&alignment.acceptor.cigar); + let donor_cigar = alignment.donor.cigar_string(); + let acceptor_cigar = alignment.acceptor.cigar_string(); // Write line writeln!( @@ -162,7 +162,7 @@ fn format_sa_entry( let chr_start = chr_starts[seg.chr_idx]; let pos = seg.genome_start - chr_start + 1; // 1-based per-chr let strand = if seg.is_reverse { '-' } else { '+' }; - let cigar = cigar_to_string(&seg.cigar); + let cigar = seg.cigar_string(); format!( "{},{},{},{},{},{};", chr, pos, strand, cigar, mapq, seg.n_mismatch @@ -180,7 +180,6 @@ fn build_segment_record( sa_tag: &str, ) -> Result { use crate::io::fastq::{complement_base, decode_base}; - use crate::io::sam::convert_cigar; use noodles::sam::alignment::record::data::field::Tag; let mut record = RecordBuf::default(); @@ -206,7 +205,7 @@ fn build_segment_record( *record.mapping_quality_mut() = MappingQuality::new(mapq); - *record.cigar_mut() = convert_cigar(&seg.cigar)?; + *record.cigar_mut() = seg.cigar.iter().copied().collect(); // Primary record carries the full read sequence; supplementary uses * (empty). if !is_supplementary { @@ -233,47 +232,14 @@ fn build_segment_record( Ok(record) } -/// Convert CIGAR operations to CIGAR string -fn cigar_to_string(cigar: &[crate::align::transcript::CigarOp]) -> String { - use crate::align::transcript::CigarOp; - - let mut result = String::new(); - for op in cigar { - match op { - CigarOp::Match(len) => result.push_str(&format!("{}M", len)), - CigarOp::Equal(len) => result.push_str(&format!("{}=", len)), - CigarOp::Diff(len) => result.push_str(&format!("{}X", len)), - CigarOp::Ins(len) => result.push_str(&format!("{}I", len)), - CigarOp::Del(len) => result.push_str(&format!("{}D", len)), - CigarOp::RefSkip(len) => result.push_str(&format!("{}N", len)), - CigarOp::SoftClip(len) => result.push_str(&format!("{}S", len)), - CigarOp::HardClip(len) => result.push_str(&format!("{}H", len)), - } - } - result -} - #[cfg(test)] mod tests { use super::*; - use crate::align::transcript::CigarOp; use crate::chimeric::segment::{ChimericAlignment, ChimericSegment}; + use noodles::sam::alignment::record::cigar; use std::io::Read; use tempfile::tempdir; - #[test] - fn test_cigar_to_string() { - let cigar = vec![ - CigarOp::Match(50), - CigarOp::Ins(2), - CigarOp::Del(3), - CigarOp::RefSkip(1000), - CigarOp::SoftClip(5), - ]; - - assert_eq!(cigar_to_string(&cigar), "50M2I3D1000N5S"); - } - #[test] fn test_chimeric_junction_writer_creation() { let dir = tempdir().unwrap(); @@ -289,6 +255,7 @@ mod tests { #[test] fn test_write_inter_chromosomal() { + use cigar::op::{Kind, Op}; let dir = tempdir().unwrap(); let prefix = dir.path().to_str().unwrap(); @@ -302,7 +269,7 @@ mod tests { is_reverse: false, read_start: 0, read_end: 63, - cigar: vec![CigarOp::Match(63)], + cigar: vec![Op::new(Kind::Match, 63)], score: 100, n_mismatch: 2, }; @@ -314,7 +281,7 @@ mod tests { is_reverse: false, read_start: 63, read_end: 100, - cigar: vec![CigarOp::Match(37)], + cigar: vec![Op::new(Kind::Match, 37)], score: 80, n_mismatch: 1, }; @@ -368,6 +335,7 @@ mod tests { #[test] fn test_write_strand_break() { + use cigar::op::{Kind, Op}; let dir = tempdir().unwrap(); let prefix = dir.path().to_str().unwrap(); @@ -381,7 +349,7 @@ mod tests { is_reverse: false, read_start: 0, read_end: 50, - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 100, n_mismatch: 1, }; @@ -393,7 +361,7 @@ mod tests { is_reverse: true, read_start: 50, read_end: 100, - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 100, n_mismatch: 1, }; @@ -452,6 +420,7 @@ mod tests { #[test] fn test_within_bam_returns_two_records() { + use cigar::op::{Kind, Op}; let donor = ChimericSegment { chr_idx: 0, genome_start: 100, @@ -459,7 +428,7 @@ mod tests { is_reverse: false, read_start: 0, read_end: 63, - cigar: vec![CigarOp::Match(63)], + cigar: vec![Op::new(Kind::Match, 63)], score: 63, n_mismatch: 0, }; @@ -470,7 +439,7 @@ mod tests { is_reverse: false, read_start: 63, read_end: 100, - cigar: vec![CigarOp::Match(37)], + cigar: vec![Op::new(Kind::Match, 37)], score: 37, n_mismatch: 1, }; @@ -491,6 +460,7 @@ mod tests { #[test] fn test_within_bam_donor_not_supplementary() { + use cigar::op::{Kind, Op}; let donor = ChimericSegment { chr_idx: 0, genome_start: 100, @@ -498,7 +468,7 @@ mod tests { is_reverse: false, read_start: 0, read_end: 63, - cigar: vec![CigarOp::Match(63)], + cigar: vec![Op::new(Kind::Match, 63)], score: 63, n_mismatch: 0, }; @@ -509,7 +479,7 @@ mod tests { is_reverse: false, read_start: 63, read_end: 100, - cigar: vec![CigarOp::Match(37)], + cigar: vec![Op::new(Kind::Match, 37)], score: 37, n_mismatch: 1, }; @@ -540,6 +510,7 @@ mod tests { #[test] fn test_within_bam_sa_tag_format() { + use cigar::op::{Kind, Op}; use noodles::sam::alignment::record::data::field::Tag; let donor = ChimericSegment { chr_idx: 0, @@ -548,7 +519,7 @@ mod tests { is_reverse: false, read_start: 0, read_end: 63, - cigar: vec![CigarOp::Match(63)], + cigar: vec![Op::new(Kind::Match, 63)], score: 63, n_mismatch: 2, }; @@ -559,7 +530,7 @@ mod tests { is_reverse: true, read_start: 63, read_end: 100, - cigar: vec![CigarOp::Match(37)], + cigar: vec![Op::new(Kind::Match, 37)], score: 37, n_mismatch: 1, }; @@ -604,6 +575,7 @@ mod tests { #[test] fn test_within_bam_donor_has_sequence() { + use cigar::op::{Kind, Op}; let donor = ChimericSegment { chr_idx: 0, genome_start: 100, @@ -611,7 +583,7 @@ mod tests { is_reverse: false, read_start: 0, read_end: 63, - cigar: vec![CigarOp::Match(63)], + cigar: vec![Op::new(Kind::Match, 63)], score: 63, n_mismatch: 0, }; @@ -622,7 +594,7 @@ mod tests { is_reverse: false, read_start: 63, read_end: 100, - cigar: vec![CigarOp::Match(37)], + cigar: vec![Op::new(Kind::Match, 37)], score: 37, n_mismatch: 0, }; diff --git a/src/chimeric/segment.rs b/src/chimeric/segment.rs index 1f48e75..78c5ecc 100644 --- a/src/chimeric/segment.rs +++ b/src/chimeric/segment.rs @@ -1,6 +1,7 @@ -// ChimericSegment and ChimericAlignment data structures +//! ChimericSegment and ChimericAlignment data structures -use crate::align::transcript::CigarOp; +use crate::align::transcript::cigar_to_string; +use noodles::sam::alignment::record::cigar; /// A single segment of a chimeric alignment #[derive(Debug, Clone)] @@ -11,7 +12,7 @@ pub struct ChimericSegment { pub is_reverse: bool, pub read_start: usize, pub read_end: usize, - pub cigar: Vec, + pub cigar: Vec, pub score: i32, pub n_mismatch: u32, } @@ -31,6 +32,11 @@ impl ChimericSegment { pub fn meets_min_length(&self, min_len: u32) -> bool { self.read_length() >= min_len as usize } + + /// Format CIGAR string + pub fn cigar_string(&self) -> String { + cigar_to_string(&self.cigar) + } } /// A chimeric alignment consisting of two segments @@ -112,6 +118,7 @@ impl ChimericAlignment { #[cfg(test)] mod tests { use super::*; + use cigar::op::{Kind, Op}; fn mock_segment( chr_idx: usize, @@ -127,7 +134,7 @@ mod tests { is_reverse: false, read_start, read_end, - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 100, n_mismatch: 2, } diff --git a/src/io/bam.rs b/src/io/bam.rs index 2ac0517..9a674ec 100644 --- a/src/io/bam.rs +++ b/src/io/bam.rs @@ -461,8 +461,9 @@ impl SortedBamStdoutWriter { #[cfg(test)] mod tests { use super::*; - use crate::align::transcript::{CigarOp, Exon, Transcript}; + use crate::align::transcript::{Exon, Transcript}; use clap::Parser; + use noodles::sam::alignment::record::cigar; use tempfile::NamedTempFile; fn create_test_genome() -> Genome { @@ -518,6 +519,8 @@ mod tests { #[test] fn test_bam_alignment_write() { + use cigar::op::{Kind, Op}; + let genome = create_test_genome(); let params = create_test_params(); let temp_file = NamedTempFile::new().unwrap(); @@ -537,7 +540,7 @@ mod tests { read_end: 4, i_frag: 0, }], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 0, n_mismatch: 0, n_gap: 0, diff --git a/src/io/sam.rs b/src/io/sam.rs index ee91a43..9ae7611 100644 --- a/src/io/sam.rs +++ b/src/io/sam.rs @@ -1,6 +1,6 @@ -/// SAM/BAM output writer with noodles +//! SAM/BAM output writer with noodles use crate::align::read_align::PairedAlignment; -use crate::align::transcript::{CigarOp, Transcript}; +use crate::align::transcript::{Transcript, cigar_to_string}; use crate::error::Error; use crate::genome::Genome; use crate::io::fastq::{complement_base, decode_base}; @@ -11,6 +11,7 @@ use bstr::BString; use noodles::sam; use noodles::sam::alignment::io::Write; use noodles::sam::alignment::record::MappingQuality; +use noodles::sam::alignment::record::cigar; use noodles::sam::alignment::record::data::field::Tag; use noodles::sam::alignment::record_buf::data::field::Value; use noodles::sam::alignment::record_buf::data::field::value::Array; @@ -149,17 +150,13 @@ impl SamWriter { .name() .map(|n| String::from_utf8_lossy(n.as_ref()).to_string()) .unwrap_or_default(); - let cigar_str: String = cigar_ops - .iter() - .map(|op| format!("{}{:?}", op.len(), op.kind())) - .collect::(); panic!( "[SAM-MISMATCH] read={} cigar_query_len={} seq_len={} flags={:?} cigar={}", name, cigar_query_len, seq_len, record.flags(), - cigar_str + cigar_to_string(cigar_ops) ); } self.writer.write_alignment_record(&self.header, record)?; @@ -424,7 +421,7 @@ impl SamWriter { Error::Alignment(format!("invalid position {}: {}", mapped_pos, e)) })?); *mapped_rec.mapping_quality_mut() = MappingQuality::new(mapq); - *mapped_rec.cigar_mut() = convert_cigar(&mapped_transcript.cigar)?; + *mapped_rec.cigar_mut() = mapped_transcript.cigar.iter().copied().collect(); // RNEXT = own chr, PNEXT = own pos (STAR convention for unmapped mate) *mapped_rec.mate_reference_sequence_id_mut() = Some(mapped_transcript.chr_idx); @@ -614,7 +611,7 @@ impl SamWriter { *record.mapping_quality_mut() = MappingQuality::new(mapq); // CIGAR (already has N ops stripped by align_to_transcripts) - *record.cigar_mut() = convert_cigar(&t.cigar)?; + *record.cigar_mut() = t.cigar.iter().copied().collect(); // SEQ / QUAL — STAR writes the original-orientation sequence when // FLAG 0x10 is unset (forward alignment in t-space) and RC'd seq @@ -910,8 +907,7 @@ fn transcript_to_record( *record.mapping_quality_mut() = MappingQuality::new(mapq); // CIGAR - let cigar = convert_cigar(&transcript.cigar)?; - *record.cigar_mut() = cigar; + *record.cigar_mut() = transcript.cigar.iter().copied().collect(); // Sequence and quality scores // Per SAM spec: when FLAG & 16 (reverse strand), SEQ is the reverse complement @@ -980,15 +976,16 @@ fn transcript_to_record( /// Not emitted in SAM output (STAR maps NM attribute to 'nM' tag), but kept for tests. #[cfg(test)] fn compute_edit_distance(transcript: &Transcript) -> i32 { - let indel_bases: u32 = transcript + use cigar::op::Kind; + let indel_bases: usize = transcript .cigar .iter() - .filter_map(|op| match op { - CigarOp::Ins(n) | CigarOp::Del(n) => Some(*n), + .filter_map(|op| match op.kind() { + Kind::Insertion | Kind::Deletion => Some(op.len()), _ => None, }) .sum(); - (transcript.n_mismatch + indel_bases) as i32 + (transcript.n_mismatch + indel_bases as u32) as i32 } /// Derive XS strand tag from transcript junction motifs. @@ -1038,22 +1035,23 @@ fn build_jm_tag(transcript: &Transcript) -> Option { /// Format: [start1, end1, start2, end2, ...] where start is first intronic base /// and end is last intronic base (both 1-based, inclusive). fn build_ji_tag(transcript: &Transcript, chr_start: u64) -> Option { + use cigar::op::Kind; if transcript.n_junction == 0 { return None; } let mut coords: Vec = Vec::new(); let mut genome_pos = transcript.genome_start; for op in &transcript.cigar { - match op { - CigarOp::RefSkip(n) => { + match op.kind() { + Kind::Skip => { let intron_start = (genome_pos - chr_start + 1) as i32; // 1-based - let intron_end = (genome_pos + *n as u64 - chr_start) as i32; // 1-based inclusive + let intron_end = (genome_pos + op.len() as u64 - chr_start) as i32; // 1-based inclusive coords.push(intron_start); coords.push(intron_end); - genome_pos += *n as u64; + genome_pos += op.len() as u64; } - CigarOp::Match(n) | CigarOp::Equal(n) | CigarOp::Diff(n) | CigarOp::Del(n) => { - genome_pos += *n as u64; + Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch | Kind::Deletion => { + genome_pos += op.len() as u64; } _ => {} // Ins, SoftClip, HardClip don't consume reference } @@ -1071,6 +1069,8 @@ fn build_md_tag( genome: &Genome, is_reverse: bool, ) -> String { + use cigar::op::Kind; + // Build the SAM-order sequence (RC for reverse strand) let sam_seq: Vec = if is_reverse { read_seq.iter().rev().map(|&b| complement_base(b)).collect() @@ -1084,9 +1084,9 @@ fn build_md_tag( let mut read_pos: usize = 0; for op in &transcript.cigar { - match op { - CigarOp::Match(n) | CigarOp::Equal(n) | CigarOp::Diff(n) => { - for _ in 0..*n { + match op.kind() { + Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => { + for _ in 0..op.len() { let ref_base = genome.get_base(genome_pos).unwrap_or(4); let read_base = if read_pos < sam_seq.len() { sam_seq[read_pos] @@ -1104,26 +1104,23 @@ fn build_md_tag( read_pos += 1; } } - CigarOp::Del(n) => { + Kind::Deletion => { write!(md, "{}", match_count).unwrap(); match_count = 0; md.push('^'); - for _ in 0..*n { + for _ in 0..op.len() { let ref_base = genome.get_base(genome_pos).unwrap_or(4); md.push(decode_base(ref_base) as char); genome_pos += 1; } } - CigarOp::Ins(n) => { - read_pos += *n as usize; - } - CigarOp::RefSkip(n) => { - genome_pos += *n as u64; + Kind::Insertion | Kind::SoftClip => { + read_pos += op.len(); } - CigarOp::SoftClip(n) => { - read_pos += *n as usize; + Kind::Skip => { + genome_pos += op.len() as u64; } - CigarOp::HardClip(_) => {} + Kind::HardClip | Kind::Pad => {} } } // Emit trailing match count @@ -1131,32 +1128,6 @@ fn build_md_tag( md } -/// Convert rustar-aligner CigarOp to noodles Cigar -pub(crate) fn convert_cigar(ops: &[CigarOp]) -> Result { - use sam::alignment::record::cigar::op::Kind; - - let mut cigar = sam::alignment::record_buf::Cigar::default(); - - for op in ops { - let kind = match op { - CigarOp::Match(_) => Kind::Match, - CigarOp::Equal(_) => Kind::SequenceMatch, - CigarOp::Diff(_) => Kind::SequenceMismatch, - CigarOp::Ins(_) => Kind::Insertion, - CigarOp::Del(_) => Kind::Deletion, - CigarOp::RefSkip(_) => Kind::Skip, - CigarOp::SoftClip(_) => Kind::SoftClip, - CigarOp::HardClip(_) => Kind::HardClip, - }; - - let len = op.len() as usize; - let noodles_op = sam::alignment::record::cigar::Op::new(kind, len); - cigar.as_mut().push(noodles_op); - } - - Ok(cigar) -} - /// Build a SAM record for one mate of a paired-end read #[allow(clippy::too_many_arguments)] fn build_paired_mate_record( @@ -1231,8 +1202,7 @@ fn build_paired_mate_record( *record.mapping_quality_mut() = MappingQuality::new(mapq); // CIGAR - let cigar = convert_cigar(&transcript.cigar)?; - *record.cigar_mut() = cigar; + *record.cigar_mut() = transcript.cigar.iter().copied().collect(); // RNEXT (mate reference sequence from the mate's actual alignment) *record.mate_reference_sequence_id_mut() = Some(mate_transcript.chr_idx); @@ -1315,6 +1285,7 @@ mod tests { use crate::align::score::SpliceMotif; use crate::genome::Genome; use clap::Parser; + use noodles::sam::alignment::record::cigar; use tempfile::NamedTempFile; /// Build an attribute set with all tags enabled (for tests that don't care about filtering) @@ -1344,20 +1315,6 @@ mod tests { } } - #[test] - fn test_convert_cigar() { - let ops = vec![ - CigarOp::Match(50), - CigarOp::Ins(3), - CigarOp::Del(2), - CigarOp::RefSkip(100), - CigarOp::Match(50), - ]; - - let cigar = convert_cigar(&ops).unwrap(); - assert_eq!(cigar.as_ref().len(), 5); - } - #[test] fn test_build_sam_header() { let genome = make_test_genome(); @@ -1401,6 +1358,7 @@ mod tests { #[test] fn test_sam_output_includes_rg_header_and_tag() { use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; use std::io::Read; let genome = make_test_genome(); @@ -1428,7 +1386,7 @@ mod tests { read_end: 4, i_frag: 0, }], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1478,6 +1436,7 @@ mod tests { #[test] fn test_transcript_to_record() { + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let transcript = Transcript { @@ -1486,7 +1445,7 @@ mod tests { genome_end: 60, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 100, n_mismatch: 2, n_gap: 0, @@ -1569,6 +1528,7 @@ mod tests { #[test] fn test_build_paired_mate_record_flags() { use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let genome = make_test_genome(); @@ -1584,7 +1544,7 @@ mod tests { read_end: 4, i_frag: 0, }], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 0, n_gap: 0, @@ -1606,7 +1566,7 @@ mod tests { read_end: 3, i_frag: 0, }], - cigar: vec![CigarOp::Match(3)], + cigar: vec![Op::new(Kind::Match, 3)], score: 90, n_mismatch: 1, n_gap: 0, @@ -1679,6 +1639,7 @@ mod tests { #[test] fn test_build_paired_mate_record_mate_fields() { use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let genome = make_test_genome(); @@ -1695,7 +1656,7 @@ mod tests { read_end: 4, i_frag: 0, }], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 200, n_mismatch: 0, n_gap: 0, @@ -1718,7 +1679,7 @@ mod tests { read_end: 3, i_frag: 0, }], - cigar: vec![CigarOp::Match(3)], + cigar: vec![Op::new(Kind::Match, 3)], score: 150, n_mismatch: 1, n_gap: 0, @@ -1774,6 +1735,7 @@ mod tests { #[test] fn test_tags_nh_hi_as_nm() { + use cigar::op::{Kind, Op}; let genome = make_test_genome(); // Transcript with 2 mismatches and a 3bp deletion → NM = 2 + 3 = 5 @@ -1783,7 +1745,11 @@ mod tests { genome_end: 60, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(20), CigarOp::Del(3), CigarOp::Match(30)], + cigar: vec![ + Op::new(Kind::Match, 20), + Op::new(Kind::Deletion, 3), + Op::new(Kind::Match, 30), + ], score: 100, n_mismatch: 2, n_gap: 1, @@ -1843,6 +1809,7 @@ mod tests { #[test] fn test_edit_distance_computation() { + use cigar::op::{Kind, Op}; // Pure match: NM = n_mismatch only let t1 = Transcript { chr_idx: 0, @@ -1850,7 +1817,7 @@ mod tests { genome_end: 50, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 100, n_mismatch: 3, n_gap: 0, @@ -1869,11 +1836,11 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(20), - CigarOp::Ins(5), - CigarOp::Match(10), - CigarOp::Del(7), - CigarOp::Match(20), + Op::new(Kind::Match, 20), + Op::new(Kind::Insertion, 5), + Op::new(Kind::Match, 10), + Op::new(Kind::Deletion, 7), + Op::new(Kind::Match, 20), ], score: 80, n_mismatch: 1, @@ -1893,9 +1860,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(25), - CigarOp::RefSkip(1000), - CigarOp::Match(25), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 1000), + Op::new(Kind::Match, 25), ], score: 50, n_mismatch: 0, @@ -1915,9 +1882,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::SoftClip(10), - CigarOp::Match(40), - CigarOp::SoftClip(10), + Op::new(Kind::SoftClip, 10), + Op::new(Kind::Match, 40), + Op::new(Kind::SoftClip, 10), ], score: 40, n_mismatch: 2, @@ -1932,6 +1899,7 @@ mod tests { #[test] fn test_transcript_to_record_has_tags() { + use cigar::op::{Kind, Op}; // Verify the existing test_transcript_to_record scenario also has tags let genome = make_test_genome(); @@ -1941,7 +1909,7 @@ mod tests { genome_end: 60, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 100, n_mismatch: 2, n_gap: 0, @@ -1985,6 +1953,7 @@ mod tests { #[test] fn test_secondary_flag() { + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let params = Parameters::parse_from(vec!["rustar-aligner", "--readFilesIn", "test.fq"]); @@ -1995,7 +1964,7 @@ mod tests { genome_end: 50, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 100, n_mismatch: 0, n_gap: 0, @@ -2010,7 +1979,7 @@ mod tests { genome_end: 52, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 98, n_mismatch: 1, n_gap: 0, @@ -2025,7 +1994,7 @@ mod tests { genome_end: 54, is_reverse: true, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 96, n_mismatch: 2, n_gap: 0, @@ -2063,6 +2032,7 @@ mod tests { #[test] fn test_xs_tag_spliced() { + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let transcript = Transcript { @@ -2072,9 +2042,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(25), - CigarOp::RefSkip(100), - CigarOp::Match(25), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 25), ], score: 50, n_mismatch: 0, @@ -2111,6 +2081,7 @@ mod tests { #[test] fn test_xs_tag_unspliced() { + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let transcript = Transcript { @@ -2119,7 +2090,7 @@ mod tests { genome_end: 50, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 100, n_mismatch: 0, n_gap: 0, @@ -2155,6 +2126,7 @@ mod tests { #[test] fn test_xs_tag_reverse_strand() { + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let transcript = Transcript { @@ -2164,9 +2136,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(25), - CigarOp::RefSkip(100), - CigarOp::Match(25), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 25), ], score: 50, n_mismatch: 0, @@ -2203,6 +2175,7 @@ mod tests { #[test] fn test_xs_tag_conflicting_motifs() { + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let transcript = Transcript { @@ -2212,11 +2185,11 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(25), - CigarOp::RefSkip(100), - CigarOp::Match(25), - CigarOp::RefSkip(100), - CigarOp::Match(25), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 25), ], score: 50, n_mismatch: 0, @@ -2253,6 +2226,7 @@ mod tests { #[test] fn test_xs_not_emitted_when_disabled() { + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let transcript = Transcript { @@ -2262,9 +2236,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(25), - CigarOp::RefSkip(100), - CigarOp::Match(25), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 25), ], score: 50, n_mismatch: 0, @@ -2301,6 +2275,7 @@ mod tests { #[test] fn test_out_sam_mult_nmax() { + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let params = Parameters::parse_from(vec![ "rustar-aligner", @@ -2317,7 +2292,7 @@ mod tests { genome_end: (i + 50) as u64, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 100 - i, n_mismatch: 0, n_gap: 0, @@ -2377,6 +2352,7 @@ mod tests { #[test] fn test_build_jm_tag_basic() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 0, @@ -2384,9 +2360,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(25), - CigarOp::RefSkip(100), - CigarOp::Match(25), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 25), ], score: 50, n_mismatch: 0, @@ -2405,6 +2381,7 @@ mod tests { #[test] fn test_build_jm_tag_annotated() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 0, @@ -2412,9 +2389,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(25), - CigarOp::RefSkip(100), - CigarOp::Match(25), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 25), ], score: 50, n_mismatch: 0, @@ -2433,13 +2410,14 @@ mod tests { #[test] fn test_build_jm_tag_empty() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 0, genome_end: 50, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 50, n_mismatch: 0, n_gap: 0, @@ -2454,6 +2432,7 @@ mod tests { #[test] fn test_build_jm_tag_multiple_junctions() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 0, @@ -2461,11 +2440,11 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(25), - CigarOp::RefSkip(100), - CigarOp::Match(25), - CigarOp::RefSkip(100), - CigarOp::Match(25), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 25), ], score: 50, n_mismatch: 0, @@ -2484,6 +2463,7 @@ mod tests { #[test] fn test_build_ji_tag_basic() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 100, @@ -2491,9 +2471,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(25), - CigarOp::RefSkip(200), - CigarOp::Match(25), + Op::new(Kind::Match, 25), + Op::new(Kind::Skip, 200), + Op::new(Kind::Match, 25), ], score: 50, n_mismatch: 0, @@ -2513,13 +2493,14 @@ mod tests { #[test] fn test_build_ji_tag_empty() { + use cigar::op::{Kind, Op}; let transcript = Transcript { chr_idx: 0, genome_start: 0, genome_end: 50, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(50)], + cigar: vec![Op::new(Kind::Match, 50)], score: 50, n_mismatch: 0, n_gap: 0, @@ -2534,6 +2515,7 @@ mod tests { #[test] fn test_build_md_tag_perfect_match() { + use cigar::op::{Kind, Op}; // Genome: ACGTACGT (A=0,C=1,G=2,T=3) let genome = make_test_genome(); let transcript = Transcript { @@ -2542,7 +2524,7 @@ mod tests { genome_end: 4, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 4, n_mismatch: 0, n_gap: 0, @@ -2560,6 +2542,7 @@ mod tests { #[test] fn test_build_md_tag_mismatches() { + use cigar::op::{Kind, Op}; // Genome: ACGTACGT let genome = make_test_genome(); let transcript = Transcript { @@ -2568,7 +2551,7 @@ mod tests { genome_end: 4, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 2, n_mismatch: 2, n_gap: 0, @@ -2587,6 +2570,7 @@ mod tests { #[test] fn test_build_md_tag_deletion() { + use cigar::op::{Kind, Op}; // Genome: ACGTACGT let genome = make_test_genome(); let transcript = Transcript { @@ -2595,7 +2579,11 @@ mod tests { genome_end: 6, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(2), CigarOp::Del(2), CigarOp::Match(2)], + cigar: vec![ + Op::new(Kind::Match, 2), + Op::new(Kind::Deletion, 2), + Op::new(Kind::Match, 2), + ], score: 4, n_mismatch: 0, n_gap: 1, @@ -2613,6 +2601,7 @@ mod tests { #[test] fn test_build_md_tag_insertion() { + use cigar::op::{Kind, Op}; // Genome: ACGTACGT let genome = make_test_genome(); let transcript = Transcript { @@ -2621,7 +2610,11 @@ mod tests { genome_end: 4, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(2), CigarOp::Ins(2), CigarOp::Match(2)], + cigar: vec![ + Op::new(Kind::Match, 2), + Op::new(Kind::Insertion, 2), + Op::new(Kind::Match, 2), + ], score: 4, n_mismatch: 0, n_gap: 1, @@ -2639,6 +2632,7 @@ mod tests { #[test] fn test_build_md_tag_soft_clip() { + use cigar::op::{Kind, Op}; // Genome: ACGTACGT let genome = make_test_genome(); let transcript = Transcript { @@ -2648,9 +2642,9 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::SoftClip(2), - CigarOp::Match(4), - CigarOp::SoftClip(2), + Op::new(Kind::SoftClip, 2), + Op::new(Kind::Match, 4), + Op::new(Kind::SoftClip, 2), ], score: 4, n_mismatch: 0, @@ -2669,6 +2663,7 @@ mod tests { #[test] fn test_tags_jm_ji_md_in_record() { + use cigar::op::{Kind, Op}; // Verify tags appear in a full transcript_to_record call let genome = make_test_genome(); @@ -2678,7 +2673,7 @@ mod tests { genome_end: 4, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 4, n_mismatch: 0, n_gap: 0, @@ -2718,6 +2713,7 @@ mod tests { #[test] fn test_build_paired_mate_record_cross_strand() { use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let genome = make_test_genome(); @@ -2734,7 +2730,7 @@ mod tests { read_end: 4, i_frag: 0, }], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 0, n_gap: 0, @@ -2757,7 +2753,7 @@ mod tests { read_end: 3, i_frag: 0, }], - cigar: vec![CigarOp::Match(3)], + cigar: vec![Op::new(Kind::Match, 3)], score: 90, n_mismatch: 0, n_gap: 0, @@ -2820,6 +2816,7 @@ mod tests { #[test] fn test_build_paired_mate_record_per_mate_tags() { use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let genome = make_test_genome(); @@ -2836,7 +2833,7 @@ mod tests { read_end: 4, i_frag: 0, }], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 0, n_gap: 0, @@ -2859,7 +2856,11 @@ mod tests { read_end: 3, i_frag: 0, }], - cigar: vec![CigarOp::Match(2), CigarOp::Del(1), CigarOp::Match(1)], + cigar: vec![ + Op::new(Kind::Match, 2), + Op::new(Kind::Deletion, 1), + Op::new(Kind::Match, 1), + ], score: 80, n_mismatch: 2, n_gap: 1, @@ -2938,6 +2939,7 @@ mod tests { #[test] fn test_build_paired_mate_record_both_forward() { use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let genome = make_test_genome(); @@ -2954,7 +2956,7 @@ mod tests { read_end: 4, i_frag: 0, }], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 0, n_gap: 0, @@ -2976,7 +2978,7 @@ mod tests { read_end: 3, i_frag: 0, }], - cigar: vec![CigarOp::Match(3)], + cigar: vec![Op::new(Kind::Match, 3)], score: 90, n_mismatch: 0, n_gap: 0, @@ -3033,6 +3035,7 @@ mod tests { #[test] fn test_out_sam_attributes_standard() { + use cigar::op::{Kind, Op}; // Default (Standard) → NH, HI, AS, NM present; XS, jM, jI, MD absent let genome = make_test_genome(); @@ -3042,7 +3045,7 @@ mod tests { genome_end: 4, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 1, n_gap: 0, @@ -3108,6 +3111,7 @@ mod tests { #[test] fn test_out_sam_attributes_none() { + use cigar::op::{Kind, Op}; // None → no optional tags at all let genome = make_test_genome(); @@ -3117,7 +3121,7 @@ mod tests { genome_end: 4, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 1, n_gap: 0, @@ -3182,6 +3186,7 @@ mod tests { #[test] fn test_out_sam_attributes_explicit() { + use cigar::op::{Kind, Op}; // Explicit ["NH", "MD"] → only NH and MD present let genome = make_test_genome(); @@ -3191,7 +3196,7 @@ mod tests { genome_end: 4, is_reverse: false, exons: vec![], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 0, n_gap: 0, @@ -3317,6 +3322,7 @@ mod tests { #[test] fn test_nm_vs_nm_mismatch_difference() { + use cigar::op::{Kind, Op}; // Verify NM (edit distance) ≠ nM (mismatches only) when indels present let genome = make_test_genome(); @@ -3327,11 +3333,11 @@ mod tests { is_reverse: false, exons: vec![], cigar: vec![ - CigarOp::Match(20), - CigarOp::Ins(5), - CigarOp::Match(10), - CigarOp::Del(3), - CigarOp::Match(20), + Op::new(Kind::Match, 20), + Op::new(Kind::Insertion, 5), + Op::new(Kind::Match, 10), + Op::new(Kind::Deletion, 3), + Op::new(Kind::Match, 20), ], score: 80, n_mismatch: 2, @@ -3376,6 +3382,7 @@ mod tests { #[test] fn test_build_half_mapped_flags() { use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let params = @@ -3393,7 +3400,7 @@ mod tests { read_end: 4, i_frag: 0, }], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 0, n_gap: 0, @@ -3445,6 +3452,7 @@ mod tests { #[test] fn test_build_half_mapped_rnext_pnext() { use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let params = @@ -3462,7 +3470,7 @@ mod tests { read_end: 4, i_frag: 0, }], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 0, n_gap: 0, @@ -3527,6 +3535,7 @@ mod tests { #[test] fn test_build_half_mapped_mate_order() { use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let genome = make_test_genome(); let params = @@ -3544,7 +3553,7 @@ mod tests { read_end: 4, i_frag: 0, }], - cigar: vec![CigarOp::Match(4)], + cigar: vec![Op::new(Kind::Match, 4)], score: 100, n_mismatch: 0, n_gap: 0, diff --git a/src/lib.rs b/src/lib.rs index 09472fa..cda64a0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,10 +7,7 @@ clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::doc_markdown, - clippy::format_collect, - clippy::format_push_string, clippy::items_after_statements, - clippy::match_same_arms, clippy::missing_errors_doc, clippy::missing_panics_doc, clippy::must_use_candidate, @@ -43,6 +40,7 @@ pub mod quant; pub mod stats; use log::info; +use noodles::sam::alignment::record::cigar; use crate::params::{Parameters, RunMode}; @@ -811,20 +809,21 @@ fn extract_junction_keys( index: &crate::index::GenomeIndex, ) -> Vec { use crate::align::score::AlignmentScorer; - use crate::align::transcript::CigarOp; + use cigar::op::Kind; let scorer = AlignmentScorer::from_params_minimal(); let mut keys = Vec::new(); let mut genome_pos = transcript.genome_start; for op in &transcript.cigar { - match op { - CigarOp::RefSkip(len) => { - let intron_len = *len; + match op.kind() { + Kind::Skip => { + let intron_len = op.len(); let intron_start = genome_pos + 1; let intron_end = genome_pos + intron_len as u64; - let motif = scorer.detect_splice_motif(genome_pos, intron_len, &index.genome); + let motif = + scorer.detect_splice_motif(genome_pos, intron_len as u32, &index.genome); let strand = match motif.implied_strand() { Some('+') => 1u8, Some('-') => 2u8, @@ -842,13 +841,10 @@ fn extract_junction_keys( genome_pos += intron_len as u64; } - CigarOp::Match(len) | CigarOp::Equal(len) | CigarOp::Diff(len) => { - genome_pos += *len as u64; + Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch | Kind::Deletion => { + genome_pos += op.len() as u64; } - CigarOp::Del(len) => { - genome_pos += *len as u64; - } - CigarOp::Ins(_) | CigarOp::SoftClip(_) | CigarOp::HardClip(_) => {} + Kind::Insertion | Kind::SoftClip | Kind::HardClip | Kind::Pad => {} } } @@ -1848,7 +1844,7 @@ fn record_transcript_junctions( is_unique: bool, ) { use crate::align::score::AlignmentScorer; - use crate::align::transcript::CigarOp; + use cigar::op::Kind; // First pass: compute exon segment lengths (query-consuming bases between N operations) // An "exon segment" is the query bases on each side of a splice junction. @@ -1856,20 +1852,17 @@ fn record_transcript_junctions( let mut current_exon_len = 0u32; for op in &transcript.cigar { - match op { - CigarOp::Match(len) | CigarOp::Equal(len) | CigarOp::Diff(len) => { - current_exon_len += *len; - } - CigarOp::Ins(len) => { - current_exon_len += *len; + match op.kind() { + Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch | Kind::Insertion => { + current_exon_len += op.len() as u32; } - CigarOp::RefSkip(_) => { + Kind::Skip => { exon_lengths.push(current_exon_len); current_exon_len = 0; } // Soft clips, deletions, hard clips do not contribute to overhang // STAR counts only matched/inserted bases (not soft-clipped bases) - CigarOp::SoftClip(_) | CigarOp::Del(_) | CigarOp::HardClip(_) => {} + Kind::Deletion | Kind::SoftClip | Kind::HardClip | Kind::Pad => {} } } exon_lengths.push(current_exon_len); // Final exon segment @@ -1881,15 +1874,16 @@ fn record_transcript_junctions( let scorer = AlignmentScorer::from_params_minimal(); for op in &transcript.cigar { - match op { - CigarOp::RefSkip(len) => { + match op.kind() { + Kind::Skip => { // This is a splice junction - let intron_len = *len; + let intron_len = op.len(); let intron_start = genome_pos + 1; // 1-based, first intronic base let intron_end = genome_pos + intron_len as u64; // 1-based, last intronic base // Detect splice motif - let motif = scorer.detect_splice_motif(genome_pos, intron_len, &index.genome); + let motif = + scorer.detect_splice_motif(genome_pos, intron_len as u32, &index.genome); // Compute overhang: min(left_exon_length, right_exon_length) let left_exon = exon_lengths[junction_idx]; @@ -1925,14 +1919,10 @@ fn record_transcript_junctions( genome_pos += intron_len as u64; junction_idx += 1; } - CigarOp::Match(len) | CigarOp::Equal(len) | CigarOp::Diff(len) => { - genome_pos += *len as u64; - } - CigarOp::Ins(_) => {} - CigarOp::Del(len) => { - genome_pos += *len as u64; + Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch | Kind::Deletion => { + genome_pos += op.len() as u64; } - CigarOp::SoftClip(_) | CigarOp::HardClip(_) => {} + Kind::Insertion | Kind::SoftClip | Kind::HardClip | Kind::Pad => {} } } } diff --git a/src/mapq.rs b/src/mapq.rs index c1c8c02..d93e2e4 100644 --- a/src/mapq.rs +++ b/src/mapq.rs @@ -15,11 +15,10 @@ /// MAPQ score (0-255) pub fn calculate_mapq(n_alignments: usize, mapq_unique: u8) -> u8 { match n_alignments { - 0 => 0, 1 => mapq_unique, 2 => 3, 3 | 4 => 1, - _ => 0, // >= 5 + _ => 0, // 0 or >= 5 } } diff --git a/src/quant/transcriptome.rs b/src/quant/transcriptome.rs index bd2d83a..bee81d8 100644 --- a/src/quant/transcriptome.rs +++ b/src/quant/transcriptome.rs @@ -10,15 +10,15 @@ //! only substantive divergence that rustar-aligner builds the transcript tables on //! the fly from the input GTF instead of loading persisted //! `transcriptInfo.tab`/`exonInfo.tab` files. -use std::collections::HashMap; -use std::io::Write as _; -use std::path::Path; - -use crate::align::transcript::{CigarOp, Exon, Transcript}; +use crate::align::transcript::{CigarOpExt as _, Exon, Transcript}; use crate::error::Error; use crate::genome::Genome; use crate::junction::gtf::GtfRecord; use crate::params::Parameters; +use noodles::sam::alignment::record::cigar; +use std::collections::HashMap; +use std::io::Write as _; +use std::path::Path; /// GTF attribute names the transcriptome index reads. const GTF_ATTR_TRANSCRIPT_ID: &str = "transcript_id"; @@ -265,9 +265,8 @@ impl TranscriptomeIndex { let total_len = ex_len_cum; let strand_u8 = match strand_char { - '+' => 1u8, '-' => 2u8, - _ => 1u8, // unknown → treat as forward (STAR default) + _ => 1u8, // forward or unknown → treat as forward (STAR default) }; let gene_id = first.attributes.get(gene_tag).cloned().unwrap_or_default(); @@ -1102,10 +1101,10 @@ fn align_to_one_transcript( // Build projected CIGAR: drop N operations (splices collapse in t-space); // for reverse-strand transcripts, reverse the resulting op sequence. - let mut proj_cigar: Vec = align + let mut proj_cigar: Vec = align .cigar .iter() - .filter(|op| !matches!(op, CigarOp::RefSkip(_))) + .filter(|op| op.kind() != cigar::op::Kind::Skip) .copied() .collect(); if tr_strand == 2 { @@ -1172,10 +1171,11 @@ pub fn filter_and_project( } fn has_soft_clip(align: &Transcript) -> bool { + use cigar::op::Kind; align .cigar .iter() - .any(|op| matches!(op, CigarOp::SoftClip(n) if *n > 0)) + .any(|op| op.kind() == Kind::SoftClip && !op.is_empty()) } /// Extend the 5'/3' soft-clips of `align` back into matched bases, counting @@ -1188,7 +1188,7 @@ fn extend_softclips( params: &Parameters, ) -> Option { // Determine left / right clip sizes from the CIGAR. - let (left_clip, right_clip) = align.count_soft_clips(); + let [left_clip, right_clip] = align.count_soft_clips(); let mut n_mm_extra: u32 = 0; @@ -1198,7 +1198,7 @@ fn extend_softclips( if left_clip > 0 && let Some(first) = align.exons.first() { - for b in 1..=left_clip as usize { + for b in 1..=left_clip { if b > first.read_start { break; } @@ -1224,7 +1224,7 @@ fn extend_softclips( if right_clip > 0 && let Some(last) = align.exons.last() { - for b in 0..right_clip as usize { + for b in 0..right_clip { let r_idx = last.read_end + b; let g_idx = (last.genome_end as usize) + b; if r_idx >= read_bases_align_orientation.len() || g_idx >= genome.sequence.len() { @@ -1263,7 +1263,7 @@ fn extend_softclips( if right_clip > 0 && let Some(last) = ext.exons.last_mut() { - last.read_end += right_clip as usize; + last.read_end += right_clip; last.genome_end += right_clip as u64; } @@ -1281,17 +1281,18 @@ fn extend_softclips( /// adjacent `Match` ops. Interior soft-clips (rare — only appear in chimeric /// contexts) are left alone. fn rebuild_cigar_without_softclips( - cigar: &[CigarOp], - left_clip: u32, - right_clip: u32, -) -> Vec { - let mut out: Vec = Vec::with_capacity(cigar.len()); + cigar: &[cigar::Op], + left_clip: usize, + right_clip: usize, +) -> Vec { + use cigar::op::{Kind, Op}; + let mut out: Vec = Vec::with_capacity(cigar.len()); let mut start_idx = 0; let mut end_idx = cigar.len(); - if left_clip > 0 && matches!(cigar.first(), Some(CigarOp::SoftClip(_))) { + if left_clip > 0 && cigar.first().is_some_and(|op| op.kind() == Kind::SoftClip) { start_idx = 1; } - if right_clip > 0 && end_idx > start_idx && matches!(cigar[end_idx - 1], CigarOp::SoftClip(_)) { + if right_clip > 0 && end_idx > start_idx && cigar[end_idx - 1].kind() == Kind::SoftClip { end_idx -= 1; } @@ -1299,23 +1300,23 @@ fn rebuild_cigar_without_softclips( for (i, op) in body.iter().enumerate() { if i == 0 && left_clip > 0 { // Fold left_clip into the first op if it's match-like. - match op { - CigarOp::Match(n) => out.push(CigarOp::Match(n + left_clip)), - CigarOp::Equal(n) => out.push(CigarOp::Equal(n + left_clip)), + match op.kind() { + Kind::Match | Kind::SequenceMatch => out.push(op.add_len(left_clip)), _ => { // Extension landed on a non-match op (shouldn't normally // happen). Emit as Match. - out.push(CigarOp::Match(left_clip)); + out.push(Op::new(Kind::Match, left_clip)); out.push(*op); } } } else if i + 1 == body.len() && right_clip > 0 { - match op { - CigarOp::Match(n) => out.push(CigarOp::Match(n + right_clip)), - CigarOp::Equal(n) => out.push(CigarOp::Equal(n + right_clip)), + match op.kind() { + Kind::Match | Kind::SequenceMatch => { + out.push(op.add_len(right_clip)); + } _ => { out.push(*op); - out.push(CigarOp::Match(right_clip)); + out.push(Op::new(Kind::Match, right_clip)); } } } else { @@ -1382,6 +1383,8 @@ fn is_splice_boundary_before(align: &Transcript, iab: usize) -> bool { #[cfg(test)] mod tests { + use noodles::sam::alignment::record::cigar; + use super::*; fn make_genome() -> Genome { @@ -1952,7 +1955,7 @@ mod tests { chr_idx: usize, is_reverse: bool, exons: Vec<(u64, u64, usize, usize)>, - cigar: Vec, + cigar: Vec, ) -> Transcript { let proj_exons: Vec = exons .into_iter() @@ -1985,13 +1988,19 @@ mod tests { #[test] fn project_single_exon_align_into_single_exon_transcript() { + use cigar::op::{Kind, Op}; let genome = make_genome(); // Transcript: chr1 [100, 200) forward. let gtf = vec![make_exon("chr1", 101, 200, '+', "G1", "T1")]; let idx = TranscriptomeIndex::from_gtf_exons(>f, &genome).unwrap(); // Align fully inside exon: genome [110, 150), read [0, 40). - let align = make_align(0, false, vec![(110, 150, 0, 40)], vec![CigarOp::Match(40)]); + let align = make_align( + 0, + false, + vec![(110, 150, 0, 40)], + vec![Op::new(Kind::Match, 40)], + ); let results = align_to_transcripts(&align, &idx, 40); assert_eq!(results.len(), 1); let r = &results[0]; @@ -2004,12 +2013,13 @@ mod tests { assert_eq!(r.exons[0].read_start, 0); assert_eq!(r.exons[0].read_end, 40); // CIGAR must have no N - assert!(r.cigar.iter().all(|op| !matches!(op, CigarOp::RefSkip(_)))); + assert!(r.cigar.iter().all(|op| op.kind() != Kind::Skip)); assert_eq!(r.cigar.len(), 1); } #[test] fn project_two_exon_align_matching_junction() { + use cigar::op::{Kind, Op}; let genome = make_genome(); // Transcript T1: chr1 [100, 200) + [300, 400) forward. tr_length = 200. let gtf = vec![ @@ -2025,9 +2035,9 @@ mod tests { false, vec![(150, 200, 0, 50), (300, 350, 50, 100)], vec![ - CigarOp::Match(50), - CigarOp::RefSkip(100), - CigarOp::Match(50), + Op::new(Kind::Match, 50), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 50), ], ); let results = align_to_transcripts(&align, &idx, 100); @@ -2042,12 +2052,13 @@ mod tests { assert_eq!(r.exons[1].genome_start, 100); assert_eq!(r.exons[1].genome_end, 150); // CIGAR: no N - assert!(r.cigar.iter().all(|op| !matches!(op, CigarOp::RefSkip(_)))); + assert!(r.cigar.iter().all(|op| op.kind() != Kind::Skip)); assert_eq!(r.cigar.len(), 2); } #[test] fn project_mismatched_junction_fails() { + use cigar::op::{Kind, Op}; let genome = make_genome(); let gtf = vec![ make_exon("chr1", 101, 200, '+', "G1", "T1"), @@ -2061,9 +2072,9 @@ mod tests { false, vec![(150, 195, 0, 45), (305, 350, 45, 90)], vec![ - CigarOp::Match(45), - CigarOp::RefSkip(110), - CigarOp::Match(45), + Op::new(Kind::Match, 45), + Op::new(Kind::Skip, 110), + Op::new(Kind::Match, 45), ], ); let results = align_to_transcripts(&align, &idx, 90); @@ -2072,13 +2083,19 @@ mod tests { #[test] fn project_onto_reverse_strand_transcript() { + use cigar::op::{Kind, Op}; let genome = make_genome(); // Reverse transcript: chr1 [100, 200) - strand. tr_length = 100. let gtf = vec![make_exon("chr1", 101, 200, '-', "G1", "T1")]; let idx = TranscriptomeIndex::from_gtf_exons(>f, &genome).unwrap(); // Align forward on genome [120, 160), read [0, 40). - let align = make_align(0, false, vec![(120, 160, 0, 40)], vec![CigarOp::Match(40)]); + let align = make_align( + 0, + false, + vec![(120, 160, 0, 40)], + vec![Op::new(Kind::Match, 40)], + ); let results = align_to_transcripts(&align, &idx, 40); assert_eq!(results.len(), 1); let r = &results[0]; @@ -2096,6 +2113,7 @@ mod tests { #[test] fn project_multi_exon_align_onto_longer_transcript() { + use cigar::op::{Kind, Op}; let genome = make_genome(); // 3-exon transcript: [100,200) + [300,400) + [500,600). tr_length = 300. let gtf = vec![ @@ -2111,9 +2129,9 @@ mod tests { false, vec![(350, 400, 0, 50), (500, 550, 50, 100)], vec![ - CigarOp::Match(50), - CigarOp::RefSkip(100), - CigarOp::Match(50), + Op::new(Kind::Match, 50), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 50), ], ); let results = align_to_transcripts(&align, &idx, 100); @@ -2130,6 +2148,7 @@ mod tests { #[test] fn project_past_transcript_end_fails() { + use cigar::op::{Kind, Op}; let genome = make_genome(); let gtf = vec![make_exon("chr1", 101, 200, '+', "G1", "T1")]; let idx = TranscriptomeIndex::from_gtf_exons(>f, &genome).unwrap(); @@ -2139,7 +2158,7 @@ mod tests { 0, false, vec![(150, 250, 0, 100)], - vec![CigarOp::Match(100)], + vec![Op::new(Kind::Match, 100)], ); let results = align_to_transcripts(&align, &idx, 100); assert_eq!(results.len(), 0); @@ -2147,17 +2166,24 @@ mod tests { #[test] fn project_before_all_transcripts_returns_empty() { + use cigar::op::{Kind, Op}; let genome = make_genome(); let gtf = vec![make_exon("chr1", 501, 600, '+', "G1", "T1")]; let idx = TranscriptomeIndex::from_gtf_exons(>f, &genome).unwrap(); - let align = make_align(0, false, vec![(100, 150, 0, 50)], vec![CigarOp::Match(50)]); + let align = make_align( + 0, + false, + vec![(100, 150, 0, 50)], + vec![Op::new(Kind::Match, 50)], + ); let results = align_to_transcripts(&align, &idx, 50); assert_eq!(results.len(), 0); } #[test] fn project_onto_multiple_overlapping_transcripts() { + use cigar::op::{Kind, Op}; let genome = make_genome(); // Two transcripts both containing the align: // T1: [100, 400) single exon @@ -2168,7 +2194,12 @@ mod tests { ]; let idx = TranscriptomeIndex::from_gtf_exons(>f, &genome).unwrap(); - let align = make_align(0, false, vec![(200, 250, 0, 50)], vec![CigarOp::Match(50)]); + let align = make_align( + 0, + false, + vec![(200, 250, 0, 50)], + vec![Op::new(Kind::Match, 50)], + ); let results = align_to_transcripts(&align, &idx, 50); assert_eq!(results.len(), 2); } @@ -2210,10 +2241,16 @@ mod tests { #[test] fn filter_default_rejects_indels() { + use cigar::op::{Kind, Op}; let genome = make_genome(); let gtf = vec![make_exon("chr1", 101, 300, '+', "G1", "T1")]; let idx = TranscriptomeIndex::from_gtf_exons(>f, &genome).unwrap(); - let mut align = make_align(0, false, vec![(110, 150, 0, 40)], vec![CigarOp::Match(40)]); + let mut align = make_align( + 0, + false, + vec![(110, 150, 0, 40)], + vec![Op::new(Kind::Match, 40)], + ); align.n_gap = 1; // simulate insertion/deletion let params = default_params(); let read = vec![0u8; 40]; @@ -2231,10 +2268,16 @@ mod tests { #[test] fn filter_keeps_indels_when_allowed() { + use cigar::op::{Kind, Op}; let genome = make_genome(); let gtf = vec![make_exon("chr1", 101, 300, '+', "G1", "T1")]; let idx = TranscriptomeIndex::from_gtf_exons(>f, &genome).unwrap(); - let mut align = make_align(0, false, vec![(110, 150, 0, 40)], vec![CigarOp::Match(40)]); + let mut align = make_align( + 0, + false, + vec![(110, 150, 0, 40)], + vec![Op::new(Kind::Match, 40)], + ); align.n_gap = 1; let params = default_params(); let read = vec![0u8; 40]; @@ -2252,6 +2295,7 @@ mod tests { #[test] fn filter_extends_left_softclip_with_zero_mismatches() { + use cigar::op::{Kind, Op}; // Build a custom genome with known content so we can construct a // read whose soft-clipped bases match the adjacent genome. let mut seq = vec![4u8; 1000]; @@ -2276,7 +2320,7 @@ mod tests { 0, false, vec![(104, 144, 4, 44)], - vec![CigarOp::SoftClip(4), CigarOp::Match(40)], + vec![Op::new(Kind::SoftClip, 4), Op::new(Kind::Match, 40)], ); // Read is 44 bases of A (0s) — matches all of genome [100, 144). let read = vec![0u8; 44]; @@ -2295,14 +2339,15 @@ mod tests { let r = &results[0]; // CIGAR should now be a single 44M (4S folded into leading M). assert_eq!(r.cigar.len(), 1); - match r.cigar[0] { - CigarOp::Match(n) => assert_eq!(n, 44), + match r.cigar[0].kind() { + Kind::Match => assert_eq!(r.cigar[0].len(), 44), _ => panic!("expected Match(44)"), } } #[test] fn filter_extends_softclip_too_many_mismatches_rejects() { + use cigar::op::{Kind, Op}; // Left clip is 4 bases, all mismatches. n_mismatch = 0 to start. // With very tight out_filter_mismatch_nmax, the alignment is rejected. let mut seq = vec![4u8; 1000]; @@ -2325,7 +2370,7 @@ mod tests { 0, false, vec![(104, 144, 4, 44)], - vec![CigarOp::SoftClip(4), CigarOp::Match(40)], + vec![Op::new(Kind::SoftClip, 4), Op::new(Kind::Match, 40)], ); // Read clip bases are all T (3) — mismatch against genome A (0) let mut read = vec![0u8; 44]; @@ -2350,6 +2395,7 @@ mod tests { #[test] fn filter_mode_single_end_keeps_softclip_as_is() { + use cigar::op::{Kind, Op}; let genome = make_genome(); let gtf = vec![make_exon("chr1", 101, 300, '+', "G1", "T1")]; let idx = TranscriptomeIndex::from_gtf_exons(>f, &genome).unwrap(); @@ -2358,7 +2404,7 @@ mod tests { 0, false, vec![(110, 150, 4, 44)], - vec![CigarOp::SoftClip(4), CigarOp::Match(40)], + vec![Op::new(Kind::SoftClip, 4), Op::new(Kind::Match, 40)], ); let read = vec![0u8; 44]; let params = default_params(); @@ -2379,7 +2425,7 @@ mod tests { results[0] .cigar .iter() - .any(|op| matches!(op, CigarOp::SoftClip(_))) + .any(|op| op.kind() == Kind::SoftClip) ); } diff --git a/src/stats.rs b/src/stats.rs index 620c798..01c2394 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -1,10 +1,11 @@ -/// Alignment statistics tracking and reporting +//! Alignment statistics tracking and reporting use log::info; use std::path::Path; use std::sync::atomic::{AtomicU64, Ordering}; -use crate::align::transcript::{CigarOp, Transcript}; +use crate::align::transcript::Transcript; use crate::junction::encode_motif; +use noodles::sam::alignment::record::cigar; /// Reason a read could not be mapped #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -144,23 +145,25 @@ impl AlignmentStats { /// Walks the CIGAR to extract mapped bases, ins/del counts, and splice motif counts. /// Only call this for unique mappers (n_alignments == 1). pub fn record_transcript_stats(&self, transcript: &Transcript) { + use cigar::op::Kind; // Walk CIGAR for mapped bases, ins/del for op in &transcript.cigar { - match op { - CigarOp::Match(len) | CigarOp::Equal(len) | CigarOp::Diff(len) => { - self.mapped_bases.fetch_add(*len as u64, Ordering::Relaxed); + match op.kind() { + Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => { + self.mapped_bases + .fetch_add(op.len() as u64, Ordering::Relaxed); } - CigarOp::Ins(len) => { + Kind::Insertion => { self.mapped_ins_count.fetch_add(1, Ordering::Relaxed); self.mapped_ins_bases - .fetch_add(*len as u64, Ordering::Relaxed); + .fetch_add(op.len() as u64, Ordering::Relaxed); } - CigarOp::Del(len) => { + Kind::Deletion => { self.mapped_del_count.fetch_add(1, Ordering::Relaxed); self.mapped_del_bases - .fetch_add(*len as u64, Ordering::Relaxed); + .fetch_add(op.len() as u64, Ordering::Relaxed); } - CigarOp::RefSkip(_) | CigarOp::SoftClip(_) | CigarOp::HardClip(_) => {} + Kind::Skip | Kind::SoftClip | Kind::HardClip | Kind::Pad => {} } } @@ -761,6 +764,7 @@ mod tests { fn test_record_transcript_stats() { use crate::align::score::SpliceMotif; use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let stats = AlignmentStats::new(); @@ -778,14 +782,14 @@ mod tests { i_frag: 0, }], cigar: vec![ - CigarOp::SoftClip(5), - CigarOp::Match(45), - CigarOp::Ins(3), - CigarOp::Match(2), - CigarOp::RefSkip(100), - CigarOp::Match(50), - CigarOp::Del(2), - CigarOp::Match(5), + Op::new(Kind::SoftClip, 5), + Op::new(Kind::Match, 45), + Op::new(Kind::Insertion, 3), + Op::new(Kind::Match, 2), + Op::new(Kind::Skip, 100), + Op::new(Kind::Match, 50), + Op::new(Kind::Deletion, 2), + Op::new(Kind::Match, 5), ], score: 100, n_mismatch: 3, @@ -832,6 +836,7 @@ mod tests { fn test_splice_motif_aggregation() { use crate::align::score::SpliceMotif; use crate::align::transcript::Exon; + use cigar::op::{Kind, Op}; let stats = AlignmentStats::new(); @@ -848,7 +853,7 @@ mod tests { read_end: 100, i_frag: 0, }], - cigar: vec![CigarOp::Match(100)], + cigar: vec![Op::new(Kind::Match, 100)], score: 100, n_mismatch: 0, n_gap: 0,