From 8ffaddd655e3616cc8b5c57f889040faf7ab82ca Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 12 May 2026 18:23:42 +0100 Subject: [PATCH] fix(transcriptome-bam): populate mate fields on paired-end records build_transcriptome_records_pe was stamping SEGMENTED + FIRST/LAST_SEGMENT but leaving PROPERLY_SEGMENTED, MATE_REVERSE_COMPLEMENTED, RNEXT, PNEXT, and TLEN unset on every transcriptome record. Salmon's alignment-mode fragment-length inference reads those mate fields; with them missing it falls back to its 250+/-25 prior, distorting paired-end TPMs. Walk the projected (rec1, rec2) pairs after the existing flag-stamping loops and mirror the mate-bookkeeping the genome-space build_paired_mate_record already does: PROPERLY_SEGMENTED + MATE_REVERSE_COMPLEMENTED + mate ref id + mate position + signed TLEN (leftmost +, rightmost -). Fixes #22 --- src/align/read_align.rs | 2 +- src/io/sam.rs | 186 ++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 9 ++ 3 files changed, 196 insertions(+), 1 deletion(-) diff --git a/src/align/read_align.rs b/src/align/read_align.rs index 8d188e8..fc754e6 100644 --- a/src/align/read_align.rs +++ b/src/align/read_align.rs @@ -1343,7 +1343,7 @@ fn check_proper_pair( } /// Calculate signed insert size (TLEN) -fn calculate_insert_size(mate1_trans: &Transcript, mate2_trans: &Transcript) -> i32 { +pub(crate) fn calculate_insert_size(mate1_trans: &Transcript, mate2_trans: &Transcript) -> i32 { // STAR outSAMtlen=1 (default): tlen is computed from the combined PE transcript span, // not from max/min of individual mate endpoints. // diff --git a/src/io/sam.rs b/src/io/sam.rs index 9e5cff5..472e9b6 100644 --- a/src/io/sam.rs +++ b/src/io/sam.rs @@ -1158,6 +1158,50 @@ pub(crate) fn convert_cigar(ops: &[CigarOp]) -> Result Result<(), Error> { + use crate::align::read_align::calculate_insert_size; + use sam::alignment::record::Flags; + + *rec1.flags_mut() |= Flags::PROPERLY_SEGMENTED; + *rec2.flags_mut() |= Flags::PROPERLY_SEGMENTED; + + if rec2.flags().is_reverse_complemented() { + *rec1.flags_mut() |= Flags::MATE_REVERSE_COMPLEMENTED; + } + if rec1.flags().is_reverse_complemented() { + *rec2.flags_mut() |= Flags::MATE_REVERSE_COMPLEMENTED; + } + + *rec1.mate_reference_sequence_id_mut() = Some(t2.chr_idx); + *rec2.mate_reference_sequence_id_mut() = Some(t1.chr_idx); + + let pos1 = (t1.genome_start + 1) as usize; + let pos2 = (t2.genome_start + 1) as usize; + *rec1.mate_alignment_start_mut() = Some( + pos2.try_into() + .map_err(|e| Error::Alignment(format!("invalid mate position {}: {}", pos2, e)))?, + ); + *rec2.mate_alignment_start_mut() = Some( + pos1.try_into() + .map_err(|e| Error::Alignment(format!("invalid mate position {}: {}", pos1, e)))?, + ); + + let tlen = calculate_insert_size(t1, t2); + *rec1.template_length_mut() = tlen; + *rec2.template_length_mut() = -tlen; + + Ok(()) +} + /// Build a SAM record for one mate of a paired-end read #[allow(clippy::too_many_arguments)] fn build_paired_mate_record( @@ -1773,6 +1817,148 @@ mod tests { ); } + #[test] + fn test_apply_pe_transcriptome_mate_fields() { + use crate::align::transcript::Exon; + use sam::alignment::record::Flags; + + let chr_idx = 7; + + let t1 = Transcript { + chr_idx, + genome_start: 100, + genome_end: 130, + is_reverse: false, + exons: vec![Exon { + genome_start: 100, + genome_end: 130, + read_start: 0, + read_end: 30, + i_frag: 0, + }], + cigar: vec![CigarOp::Match(30)], + score: 60, + n_mismatch: 0, + n_gap: 0, + n_junction: 0, + junction_motifs: vec![], + junction_annotated: vec![], + read_seq: vec![], + }; + + let t2 = Transcript { + chr_idx, + genome_start: 200, + genome_end: 230, + is_reverse: true, + exons: vec![Exon { + genome_start: 200, + genome_end: 230, + read_start: 0, + read_end: 30, + i_frag: 0, + }], + cigar: vec![CigarOp::Match(30)], + score: 60, + n_mismatch: 0, + n_gap: 0, + n_junction: 0, + junction_motifs: vec![], + junction_annotated: vec![], + read_seq: vec![], + }; + + let mut rec1 = RecordBuf::default(); + *rec1.flags_mut() = Flags::SEGMENTED | Flags::FIRST_SEGMENT; + let mut rec2 = RecordBuf::default(); + *rec2.flags_mut() = Flags::SEGMENTED | Flags::LAST_SEGMENT | Flags::REVERSE_COMPLEMENTED; + + apply_pe_transcriptome_mate_fields(&mut rec1, &mut rec2, &t1, &t2).unwrap(); + + assert!(rec1.flags().is_properly_segmented()); + assert!(rec2.flags().is_properly_segmented()); + + assert!(rec1.flags().is_mate_reverse_complemented()); + assert!(!rec2.flags().is_mate_reverse_complemented()); + + assert_eq!(rec1.mate_reference_sequence_id(), Some(chr_idx)); + assert_eq!(rec2.mate_reference_sequence_id(), Some(chr_idx)); + + let pnext1 = rec1.mate_alignment_start().unwrap(); + let pnext2 = rec2.mate_alignment_start().unwrap(); + assert_eq!(usize::from(pnext1), 201); + assert_eq!(usize::from(pnext2), 101); + + assert_ne!(rec1.template_length(), 0); + assert_ne!(rec2.template_length(), 0); + assert!(rec1.template_length() > 0); + assert!(rec2.template_length() < 0); + assert_eq!(rec1.template_length(), -rec2.template_length()); + } + + #[test] + fn test_apply_pe_transcriptome_mate_fields_reverse_cluster() { + use crate::align::transcript::Exon; + use sam::alignment::record::Flags; + + let chr_idx = 3; + + let t1 = Transcript { + chr_idx, + genome_start: 200, + genome_end: 230, + is_reverse: true, + exons: vec![Exon { + genome_start: 200, + genome_end: 230, + read_start: 0, + read_end: 30, + i_frag: 0, + }], + cigar: vec![CigarOp::Match(30)], + score: 60, + n_mismatch: 0, + n_gap: 0, + n_junction: 0, + junction_motifs: vec![], + junction_annotated: vec![], + read_seq: vec![], + }; + + let t2 = Transcript { + chr_idx, + genome_start: 100, + genome_end: 130, + is_reverse: false, + exons: vec![Exon { + genome_start: 100, + genome_end: 130, + read_start: 0, + read_end: 30, + i_frag: 0, + }], + cigar: vec![CigarOp::Match(30)], + score: 60, + n_mismatch: 0, + n_gap: 0, + n_junction: 0, + junction_motifs: vec![], + junction_annotated: vec![], + read_seq: vec![], + }; + + let mut rec1 = RecordBuf::default(); + *rec1.flags_mut() = Flags::SEGMENTED | Flags::FIRST_SEGMENT | Flags::REVERSE_COMPLEMENTED; + let mut rec2 = RecordBuf::default(); + *rec2.flags_mut() = Flags::SEGMENTED | Flags::LAST_SEGMENT; + + apply_pe_transcriptome_mate_fields(&mut rec1, &mut rec2, &t1, &t2).unwrap(); + + assert!(rec1.template_length() < 0); + assert!(rec2.template_length() > 0); + assert_eq!(rec1.template_length(), -rec2.template_length()); + } + #[test] fn test_tags_nh_hi_as_nm() { let genome = make_test_genome(); diff --git a/src/lib.rs b/src/lib.rs index d229763..bad18b8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -767,6 +767,15 @@ where *r.flags_mut() |= Flags::SEGMENTED | Flags::LAST_SEGMENT; } + for (((r1, r2), p1), p2) in rec1s + .iter_mut() + .zip(rec2s.iter_mut()) + .zip(p1s.iter()) + .zip(p2s.iter()) + { + crate::io::sam::apply_pe_transcriptome_mate_fields(r1, r2, p1, p2)?; + } + let mut out: Vec = Vec::with_capacity(n_alignments * 2); for (r1, r2) in rec1s.into_iter().zip(rec2s) {