diff --git a/src/io/sam.rs b/src/io/sam.rs index 9e5cff5..409dd29 100644 --- a/src/io/sam.rs +++ b/src/io/sam.rs @@ -103,10 +103,7 @@ impl SamWriter { }; let effective_n = n_alignments.max(n_for_mapq); let mapq = calculate_mapq(effective_n, params.out_sam_mapq_unique); - let mut attrs = params.sam_attribute_set(); - if params.out_sam_strand_field != "intronMotif" { - attrs.remove("XS"); - } + let attrs = params.effective_sam_attribute_set(); let rg_id_owned = params.primary_rg_id()?; let rg_id = rg_id_owned.as_deref(); @@ -232,10 +229,7 @@ impl SamWriter { }; let effective_n = n_alignments.max(n_for_mapq); let mapq = calculate_mapq(effective_n, params.out_sam_mapq_unique); - let mut attrs = params.sam_attribute_set(); - if params.out_sam_strand_field != "intronMotif" { - attrs.remove("XS"); - } + let attrs = params.effective_sam_attribute_set(); let rg_id_owned = params.primary_rg_id()?; let rg_id = rg_id_owned.as_deref(); @@ -299,10 +293,7 @@ impl SamWriter { }; let effective_n = n_alignments.max(n_for_mapq); let mapq = calculate_mapq(effective_n, params.out_sam_mapq_unique); - let mut attrs = params.sam_attribute_set(); - if params.out_sam_strand_field != "intronMotif" { - attrs.remove("XS"); - } + let attrs = params.effective_sam_attribute_set(); let rg_id_owned = params.primary_rg_id()?; let rg_id = rg_id_owned.as_deref(); @@ -385,10 +376,7 @@ impl SamWriter { let n_alignments = 1usize; let effective_n = n_alignments.max(n_for_mapq); let mapq = calculate_mapq(effective_n, params.out_sam_mapq_unique); - let mut attrs = params.sam_attribute_set(); - if params.out_sam_strand_field != "intronMotif" { - attrs.remove("XS"); - } + let attrs = params.effective_sam_attribute_set(); let rg_id_owned = params.primary_rg_id()?; let rg_id = rg_id_owned.as_deref(); @@ -3596,4 +3584,127 @@ mod tests { assert!(records_m2[0].flags().is_unmapped()); // mate1 is unmapped assert!(!records_m2[1].flags().is_unmapped()); // mate2 is mapped } + + fn spliced_gtag_transcript() -> Transcript { + Transcript { + chr_idx: 0, + genome_start: 0, + genome_end: 200, + is_reverse: false, + exons: vec![], + cigar: vec![ + CigarOp::Match(25), + CigarOp::RefSkip(100), + CigarOp::Match(25), + ], + score: 50, + n_mismatch: 0, + n_gap: 0, + n_junction: 1, + junction_motifs: vec![SpliceMotif::GtAg], + junction_annotated: vec![false], + read_seq: vec![0; 4], + } + } + + #[test] + fn xs_tag_emitted_when_strand_field_intron_motif_only() { + let genome = make_test_genome(); + let params = Parameters::parse_from(vec![ + "rustar-aligner", + "--readFilesIn", + "r.fq", + "--outSAMstrandField", + "intronMotif", + ]); + + let transcripts = vec![spliced_gtag_transcript()]; + let read_seq = vec![0, 1, 2, 3]; + let read_qual = vec![30, 30, 30, 30]; + + let records = SamWriter::build_alignment_records( + "read1", + &read_seq, + &read_qual, + &transcripts, + &genome, + ¶ms, + 1, + ) + .unwrap(); + + assert_eq!(records.len(), 1); + let data = records[0].data(); + assert_eq!( + data.get(&Tag::new(b'X', b'S')), + Some(&Value::Character(b'+')), + "XS:A:+ should be emitted when --outSAMstrandField intronMotif is set" + ); + } + + #[test] + fn xs_tag_emitted_when_xs_in_attributes_only() { + let genome = make_test_genome(); + let params = Parameters::parse_from(vec![ + "rustar-aligner", + "--readFilesIn", + "r.fq", + "--outSAMattributes", + "NH", + "HI", + "XS", + ]); + + let transcripts = vec![spliced_gtag_transcript()]; + let read_seq = vec![0, 1, 2, 3]; + let read_qual = vec![30, 30, 30, 30]; + + let records = SamWriter::build_alignment_records( + "read1", + &read_seq, + &read_qual, + &transcripts, + &genome, + ¶ms, + 1, + ) + .unwrap(); + + assert_eq!(records.len(), 1); + let data = records[0].data(); + assert_eq!( + data.get(&Tag::new(b'X', b'S')), + Some(&Value::Character(b'+')), + "XS:A:+ should be emitted when XS is listed in --outSAMattributes" + ); + } + + #[test] + fn xs_tag_absent_when_neither_xs_nor_intron_motif_requested() { + let genome = make_test_genome(); + let params = Parameters::parse_from(vec!["rustar-aligner", "--readFilesIn", "r.fq"]); + + let transcripts = vec![spliced_gtag_transcript()]; + let read_seq = vec![0, 1, 2, 3]; + let read_qual = vec![30, 30, 30, 30]; + + let records = SamWriter::build_alignment_records( + "read1", + &read_seq, + &read_qual, + &transcripts, + &genome, + ¶ms, + 1, + ) + .unwrap(); + + assert_eq!(records.len(), 1); + let data = records[0].data(); + assert_eq!( + data.get(&Tag::new(b'X', b'S')), + None, + "XS should not be emitted under default attributes and strand field" + ); + } } diff --git a/src/params.rs b/src/params.rs index 9c124b8..ebb6e93 100644 --- a/src/params.rs +++ b/src/params.rs @@ -777,6 +777,33 @@ impl Parameters { attrs } + /// Effective `--outSAMattributes` set after coupling with `--outSAMstrandField`. + /// + /// Mirrors STAR's `Parameters_samAttributes.cpp:172-179` and `:213-216`: + /// `XS` in the attribute set and `--outSAMstrandField intronMotif` imply + /// each other. If either is set the returned set contains `XS`; otherwise + /// `XS` is excluded even if it was listed in `--outSAMattributes` without + /// a matching strand field (STAR forces the strand field on, so XS stays). + pub fn effective_sam_attribute_set(&self) -> HashSet { + let mut attrs = self.sam_attribute_set(); + if self.effective_out_sam_strand_field() == "intronMotif" { + attrs.insert("XS".to_string()); + } + attrs + } + + /// Effective `--outSAMstrandField` value after coupling with `--outSAMattributes`. + /// + /// `XS` in `--outSAMattributes` forces `intronMotif` even when the user + /// left `--outSAMstrandField` at its default (`None`). + pub fn effective_out_sam_strand_field(&self) -> &str { + if self.out_sam_strand_field == "intronMotif" || self.sam_attribute_set().contains("XS") { + "intronMotif" + } else { + self.out_sam_strand_field.as_str() + } + } + /// True if the user provided a non-default `--outSAMattrRGline`. pub fn rg_line_set(&self) -> bool { !self.out_sam_attr_rg_line.is_empty() && self.out_sam_attr_rg_line[0] != "-" @@ -953,6 +980,19 @@ impl Parameters { } self.rg_ids()?; + let user_xs_in_attrs = self.sam_attribute_set().contains("XS"); + let user_strand_intron_motif = self.out_sam_strand_field == "intronMotif"; + if user_xs_in_attrs && !user_strand_intron_motif { + log::info!( + "--outSAMattributes contains XS, therefore rustar-aligner will use --outSAMstrandField intronMotif" + ); + } + if user_strand_intron_motif && !user_xs_in_attrs { + log::info!( + "--outSAMstrandField=intronMotif, therefore rustar-aligner will output XS attribute" + ); + } + // quantMode TranscriptomeSAM requires transcript annotations — // either via --sjdbGTFfile or pre-generated transcriptInfo.tab // et al in --genomeDir (persisted at genomeGenerate time). At @@ -1488,4 +1528,47 @@ mod tests { ]); assert_eq!(p.align_sj_stitch_mismatch_nmax, vec![1, -1, 2, 3]); } + + #[test] + fn xs_strand_field_intron_motif_adds_xs_to_attrs() { + let p = parse(&[ + "--readFilesIn", + "r.fq", + "--outSAMstrandField", + "intronMotif", + ]); + let attrs = p.effective_sam_attribute_set(); + assert!(attrs.contains("XS")); + assert_eq!(p.effective_out_sam_strand_field(), "intronMotif"); + } + + #[test] + fn xs_attr_forces_intron_motif_strand_field() { + let p = parse(&[ + "--readFilesIn", + "r.fq", + "--outSAMattributes", + "NH", + "HI", + "XS", + ]); + assert_eq!(p.effective_out_sam_strand_field(), "intronMotif"); + let attrs = p.effective_sam_attribute_set(); + assert!(attrs.contains("XS")); + } + + #[test] + fn xs_coupling_dormant_without_user_request() { + let p = parse(&["--readFilesIn", "r.fq"]); + let attrs = p.effective_sam_attribute_set(); + assert!(!attrs.contains("XS")); + assert_eq!(p.effective_out_sam_strand_field(), "None"); + } + + #[test] + fn xs_attr_via_all_preset_couples_strand_field() { + let p = parse(&["--readFilesIn", "r.fq", "--outSAMattributes", "All"]); + assert_eq!(p.effective_out_sam_strand_field(), "intronMotif"); + assert!(p.effective_sam_attribute_set().contains("XS")); + } }