From abf856ecb9721afc943ec0b9c286b80f88d9d99c Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 12 May 2026 20:12:13 +0100 Subject: [PATCH] fix: decode Gsj-region SA hits into split splice seeds Pass-1 alignment was silently dropping SA hits that fall in the appended splice-junction flanking buffer (Gsj). Those hits encode candidate splice events for annotated junctions: when a read seed straddles the donor/acceptor boundary of a Gsj slot, the matching genome bytes live in two non-adjacent real-genome positions. STAR decodes these via g1 >= P.nGenomeReal and feeds the (donor, acceptor) pair into the seed pipeline; rustar-aligner was discarding them at position_to_chr (which returns None for positions past the last real chromosome). Plumb n_genome_real onto Genome (pinned at the pre-sjdb forward total) and reload prepared junctions + sjdbOverhang from sjdbInfo.txt on the index-load path. cluster_seeds now expands each SA hit through decode_gsj_hit: real-genome hits pass through unchanged, single-flank Gsj hits are skipped (the equivalent real-genome SA entry already exists in the same range), and boundary-crossing Gsj hits split into two virtual seeds with the donor-side read run and the acceptor-side read run pointing at their respective real-genome positions. The existing stitch DP then chains them via its splice branch. Verified on the yeast SRR6357072 reproducer from the issue: Number of splices: Total jumps from 366 to 631 (target ~720) and GT/AG from 266 to 531 with non-canonical unchanged at 93. The remaining gap is gated on PR #45's annotated-junction lookup landing. Fixes #47 Co-Authored-By: Claude --- src/align/read_align.rs | 2 + src/align/score.rs | 1 + src/align/stitch.rs | 321 +++++++++++++++++++++-------------- src/chimeric/detect.rs | 2 + src/chimeric/output.rs | 1 + src/chimeric/score.rs | 1 + src/genome/mod.rs | 7 + src/index/io.rs | 26 ++- src/index/mod.rs | 20 ++- src/io/bam.rs | 1 + src/io/sam.rs | 1 + src/junction/gtf.rs | 6 + src/junction/sj_output.rs | 4 + src/junction/sjdb_insert.rs | 323 ++++++++++++++++++++++++++++++++++++ src/quant/mod.rs | 1 + src/quant/transcriptome.rs | 3 + 16 files changed, 588 insertions(+), 132 deletions(-) diff --git a/src/align/read_align.rs b/src/align/read_align.rs index 8d188e8..95e665e 100644 --- a/src/align/read_align.rs +++ b/src/align/read_align.rs @@ -1505,6 +1505,7 @@ mod tests { let genome = Genome { sequence, n_genome, + n_genome_real: n_genome, n_chr_real: 1, chr_name: vec!["chr1".to_string()], chr_length: vec![10], @@ -1535,6 +1536,7 @@ mod tests { junction_db: crate::junction::SpliceJunctionDb::empty(), transcriptome: None, prepared_junctions: Vec::new(), + sjdb_overhang: 0, } } diff --git a/src/align/score.rs b/src/align/score.rs index e93aa3a..8ded8f1 100644 --- a/src/align/score.rs +++ b/src/align/score.rs @@ -637,6 +637,7 @@ mod tests { Genome { sequence, n_genome, + n_genome_real: n_genome, n_chr_real: 1, chr_name: vec!["chr1".to_string()], chr_length: vec![seq.len() as u64], diff --git a/src/align/stitch.rs b/src/align/stitch.rs index 5c78d87..499653f 100644 --- a/src/align/stitch.rs +++ b/src/align/stitch.rs @@ -386,6 +386,58 @@ pub fn cluster_seeds( let seed_per_window_nmax = params.seed_per_window_nmax; let min_seed_length = params.seed_map_min; + let n_genome_real = index.genome.n_genome_real; + let n_genome = index.genome.n_genome; + let sjdb_overhang = index.sjdb_overhang; + let prepared = index.prepared_junctions.as_slice(); + + // Expand one raw SA hit into the candidate `(real_fwd_pos, read_offset, + // sub_length, sub_sa_pos)` tuples cluster_seeds should consume. Hits + // in the real genome pass through unchanged. Hits in the Gsj + // flanking buffer are decoded via the sjdb table - yielding one + // entry for hits confined to a single flank, two entries for hits + // that straddle the donor/acceptor boundary (so the stitch DP can + // chain them through its splice branch). + let expand_hit = |sa_pos: u64, strand: bool, length: usize| -> Vec<(u64, usize, usize, u64)> { + let raw_fwd = index.sa_pos_to_forward(sa_pos, strand, length); + if raw_fwd < n_genome_real { + return vec![(raw_fwd, 0, length, sa_pos)]; + } + if sjdb_overhang == 0 || prepared.is_empty() { + return Vec::new(); + } + let mut decoded = crate::junction::sjdb_insert::decode_gsj_hit( + raw_fwd, + length, + n_genome_real, + sjdb_overhang, + prepared, + ); + // Reverse-strand hits traverse the donor/acceptor halves in reverse + // read order: the leftmost forward bytes (donor flank) align to the + // last read bases. Swap the read offsets so each sub-seed's + // `read_pos = seed.read_pos + read_offset` lands at the right place + // in original-read coords. + if strand && decoded.len() == 2 { + let acceptor_len = decoded[1].2; + decoded[0].1 = acceptor_len; + decoded[1].1 = 0; + } + decoded + .into_iter() + .map(|(real_fwd, read_off, sub_len)| { + let sub_sa_pos = if strand { + n_genome + .saturating_sub(real_fwd) + .saturating_sub(sub_len as u64) + } else { + real_fwd + }; + (real_fwd, read_off, sub_len, sub_sa_pos) + }) + .collect() + }; + let anchor_set: Vec = seeds .iter() .map(|seed| { @@ -446,112 +498,119 @@ pub fn cluster_seeds( for (sa_pos, strand) in anchor.genome_positions(index) { // STAR uses MMP length directly without per-position verification. // All SA positions in the range match for the full MMP length by definition. - let length = anchor.length; - if length < min_seed_length { + let full_length = anchor.length; + if full_length < min_seed_length { continue; } - let forward_pos = index.sa_pos_to_forward(sa_pos, strand, length); - - let chr_idx = match index.genome.position_to_chr(forward_pos) { - Some(info) => info.0, - None => continue, - }; - - let anchor_bin = forward_pos >> win_bin_nbits; - - // STAR's stitchPieces: Phase 1 creates windows from anchor positions - // but does NOT populate WA entries. After flank extension, nWA is reset - // to 0 (line 115), then Phase 3 re-assigns ALL seeds through - // assignAlignToWindow. We match this by not adding entries here. + for (forward_pos, _read_off, sub_length, _sub_sa_pos) in + expand_hit(sa_pos, strand, full_length) + { + // Anchor sub-pieces shorter than the minimum still drive a + // window centre — STAR seeds new windows from Gsj hits even + // when the donor or acceptor half of the read is short. + let chr_idx = match index.genome.position_to_chr(forward_pos) { + Some(info) => info.0, + None => continue, + }; - // Check if this bin already has a window (STAR: skip creation, just assign) - if let Some(&win_idx) = win_bin.get(&(strand, anchor_bin)) { - let window = &mut windows[win_idx]; - if window.alive && window.chr_idx == chr_idx { - window.actual_start = window.actual_start.min(forward_pos); - window.actual_end = window.actual_end.max(forward_pos + length as u64); - continue; + let length = sub_length; + let anchor_bin = forward_pos >> win_bin_nbits; + + // STAR's stitchPieces: Phase 1 creates windows from anchor positions + // but does NOT populate WA entries. After flank extension, nWA is reset + // to 0 (line 115), then Phase 3 re-assigns ALL seeds through + // assignAlignToWindow. We match this by not adding entries here. + + // Check if this bin already has a window (STAR: skip creation, just assign) + if let Some(&win_idx) = win_bin.get(&(strand, anchor_bin)) { + let window = &mut windows[win_idx]; + if window.alive && window.chr_idx == chr_idx { + window.actual_start = window.actual_start.min(forward_pos); + window.actual_end = window.actual_end.max(forward_pos + length as u64); + continue; + } } - } - // Scan LEFT for existing window to merge with - let mut merge_left: Option = None; - for scan_bin in - (anchor_bin.saturating_sub(win_anchor_dist_nbins as u64)..anchor_bin).rev() - { - if let Some(&win_idx) = win_bin.get(&(strand, scan_bin)) { - let w = &windows[win_idx]; - if w.alive && w.chr_idx == chr_idx { - merge_left = Some(win_idx); - break; + // Scan LEFT for existing window to merge with + let mut merge_left: Option = None; + for scan_bin in + (anchor_bin.saturating_sub(win_anchor_dist_nbins as u64)..anchor_bin).rev() + { + if let Some(&win_idx) = win_bin.get(&(strand, scan_bin)) { + let w = &windows[win_idx]; + if w.alive && w.chr_idx == chr_idx { + merge_left = Some(win_idx); + break; + } } } - } - // Scan RIGHT for existing window to merge with - let mut merge_right: Option = None; - for scan_bin in (anchor_bin + 1)..=(anchor_bin + win_anchor_dist_nbins as u64) { - if let Some(&win_idx) = win_bin.get(&(strand, scan_bin)) { - let w = &windows[win_idx]; - if w.alive && w.chr_idx == chr_idx { - merge_right = Some(win_idx); - break; + // Scan RIGHT for existing window to merge with + let mut merge_right: Option = None; + for scan_bin in (anchor_bin + 1)..=(anchor_bin + win_anchor_dist_nbins as u64) { + if let Some(&win_idx) = win_bin.get(&(strand, scan_bin)) { + let w = &windows[win_idx]; + if w.alive && w.chr_idx == chr_idx { + merge_right = Some(win_idx); + break; + } } } - } - match (merge_left, merge_right) { - (Some(left_idx), Some(right_idx)) if left_idx != right_idx => { - // Merge both windows: extend left window to cover right + anchor - let right_window = &windows[right_idx]; - let new_bin_end = right_window.bin_end.max(anchor_bin); - let new_actual_start = right_window.actual_start.min(forward_pos); - let new_actual_end = right_window.actual_end.max(forward_pos + length as u64); - // Kill right window - windows[right_idx].alive = false; - - // Extend left window - let left_window = &mut windows[left_idx]; - left_window.bin_start = left_window.bin_start.min(anchor_bin); - left_window.bin_end = left_window.bin_end.max(new_bin_end); - left_window.actual_start = left_window.actual_start.min(new_actual_start); - left_window.actual_end = left_window.actual_end.max(new_actual_end); - - // Update winBin for all bins from left to right - for bin in left_window.bin_start..=left_window.bin_end { - win_bin.insert((strand, bin), left_idx); + match (merge_left, merge_right) { + (Some(left_idx), Some(right_idx)) if left_idx != right_idx => { + // Merge both windows: extend left window to cover right + anchor + let right_window = &windows[right_idx]; + let new_bin_end = right_window.bin_end.max(anchor_bin); + let new_actual_start = right_window.actual_start.min(forward_pos); + let new_actual_end = + right_window.actual_end.max(forward_pos + length as u64); + // Kill right window + windows[right_idx].alive = false; + + // Extend left window + let left_window = &mut windows[left_idx]; + left_window.bin_start = left_window.bin_start.min(anchor_bin); + left_window.bin_end = left_window.bin_end.max(new_bin_end); + left_window.actual_start = left_window.actual_start.min(new_actual_start); + left_window.actual_end = left_window.actual_end.max(new_actual_end); + + // Update winBin for all bins from left to right + for bin in left_window.bin_start..=left_window.bin_end { + win_bin.insert((strand, bin), left_idx); + } } - } - (Some(idx), _) | (_, Some(idx)) => { - // Merge with one existing window - let window = &mut windows[idx]; - window.bin_start = window.bin_start.min(anchor_bin); - window.bin_end = window.bin_end.max(anchor_bin); - window.actual_start = window.actual_start.min(forward_pos); - window.actual_end = window.actual_end.max(forward_pos + length as u64); - - // Update winBin for newly covered bins - for bin in window.bin_start..=window.bin_end { - win_bin.insert((strand, bin), idx); + (Some(idx), _) | (_, Some(idx)) => { + // Merge with one existing window + let window = &mut windows[idx]; + window.bin_start = window.bin_start.min(anchor_bin); + window.bin_end = window.bin_end.max(anchor_bin); + window.actual_start = window.actual_start.min(forward_pos); + window.actual_end = window.actual_end.max(forward_pos + length as u64); + + // Update winBin for newly covered bins + for bin in window.bin_start..=window.bin_end { + win_bin.insert((strand, bin), idx); + } + } + _ => { + // No merge: create new window + let new_idx = windows.len(); + win_bin.insert((strand, anchor_bin), new_idx); + windows.push(Window { + bin_start: anchor_bin, + bin_end: anchor_bin, + chr_idx, + is_reverse: strand, + anchor_idx, + alignments: Vec::new(), + actual_start: forward_pos, + actual_end: forward_pos + length as u64, + alive: true, + wa_lrec: 0, + }); } - } - _ => { - // No merge: create new window - let new_idx = windows.len(); - win_bin.insert((strand, anchor_bin), new_idx); - windows.push(Window { - bin_start: anchor_bin, - bin_end: anchor_bin, - chr_idx, - is_reverse: strand, - anchor_idx, - alignments: Vec::new(), - actual_start: forward_pos, - actual_end: forward_pos + length as u64, - alive: true, - wa_lrec: 0, - }); } } } @@ -608,6 +667,11 @@ pub fn cluster_seeds( is_anchor: bool, ps_rstart: usize, // positive-strand read start (sort key) mate_id: u8, // STAR: iFrag (overlap dedup must respect fragment boundaries) + /// Derived read start in original read coordinates. Identical to + /// `seeds[seed_idx].read_pos` for real-genome SA hits, advanced + /// by the donor-side length for the acceptor half of a Gsj + /// boundary-crossing hit. + read_pos: usize, } let mut win_candidates: Vec> = @@ -621,45 +685,46 @@ pub fn cluster_seeds( let is_anchor_seed = anchor_set[seed_idx]; for (sa_pos, strand) in seed.genome_positions(index) { - let length = seed.length; - if length < min_seed_length { + let full_length = seed.length; + if full_length < min_seed_length { continue; } - let forward_pos = index.sa_pos_to_forward(sa_pos, strand, length); - - let chr_idx = match index.genome.position_to_chr(forward_pos) { - Some(info) => info.0, - None => { - continue; - } - }; + for (forward_pos, read_off, sub_length, sub_sa_pos) in + expand_hit(sa_pos, strand, full_length) + { + let length = sub_length; + let chr_idx = match index.genome.position_to_chr(forward_pos) { + Some(info) => info.0, + None => continue, + }; - let seed_bin = forward_pos >> win_bin_nbits; + let seed_bin = forward_pos >> win_bin_nbits; - let win_idx = match win_bin.get(&(strand, seed_bin)) { - Some(&idx) if windows[idx].alive && windows[idx].chr_idx == chr_idx => idx, - _ => { - continue; - } - }; + let win_idx = match win_bin.get(&(strand, seed_bin)) { + Some(&idx) if windows[idx].alive && windows[idx].chr_idx == chr_idx => idx, + _ => continue, + }; - let ps_rstart = if windows[win_idx].is_reverse { - read_len - (length + seed.read_pos) - } else { - seed.read_pos - }; + let derived_read_pos = seed.read_pos + read_off; + let ps_rstart = if windows[win_idx].is_reverse { + read_len - (length + derived_read_pos) + } else { + derived_read_pos + }; - win_candidates[win_idx].push(WinCandidate { - seed_idx, - sa_pos, - forward_pos, - length, - n_loci, - is_anchor: is_anchor_seed, - ps_rstart, - mate_id: seed.mate_id, - }); + win_candidates[win_idx].push(WinCandidate { + seed_idx, + sa_pos: sub_sa_pos, + forward_pos, + length, + n_loci, + is_anchor: is_anchor_seed, + ps_rstart, + mate_id: seed.mate_id, + read_pos: derived_read_pos, + }); + } } } @@ -790,7 +855,7 @@ pub fn cluster_seeds( insert_pos, WindowAlignment { seed_idx, - read_pos: seed.read_pos, + read_pos: cand.read_pos, length, genome_pos: forward_pos, sa_pos, @@ -858,7 +923,7 @@ pub fn cluster_seeds( insert_pos, WindowAlignment { seed_idx, - read_pos: seed.read_pos, + read_pos: cand.read_pos, length, genome_pos: forward_pos, sa_pos, @@ -2969,6 +3034,7 @@ mod tests { let genome = Genome { sequence, n_genome, + n_genome_real: n_genome, n_chr_real: 1, chr_name: vec!["chr1".to_string()], chr_length: vec![10], @@ -2999,6 +3065,7 @@ mod tests { junction_db: crate::junction::SpliceJunctionDb::empty(), transcriptome: None, prepared_junctions: Vec::new(), + sjdb_overhang: 0, } } @@ -3091,6 +3158,7 @@ mod tests { let genome = Genome { sequence, n_genome, + n_genome_real: n_genome, n_chr_real: 1, chr_name: vec!["chr1".to_string()], chr_length: vec![seq.len() as u64], @@ -3119,6 +3187,7 @@ mod tests { junction_db: crate::junction::SpliceJunctionDb::empty(), transcriptome: None, prepared_junctions: Vec::new(), + sjdb_overhang: 0, } } diff --git a/src/chimeric/detect.rs b/src/chimeric/detect.rs index e248a84..7a0dee5 100644 --- a/src/chimeric/detect.rs +++ b/src/chimeric/detect.rs @@ -1000,6 +1000,7 @@ mod tests { Genome { sequence, n_genome, + n_genome_real: n_genome, n_chr_real: 2, chr_name: vec!["chr0".to_string(), "chr1".to_string()], chr_length: vec![chr_len, chr_len], @@ -1027,6 +1028,7 @@ mod tests { junction_db: SpliceJunctionDb::empty(), transcriptome: None, prepared_junctions: Vec::new(), + sjdb_overhang: 0, } } diff --git a/src/chimeric/output.rs b/src/chimeric/output.rs index 37f22b2..d5493c0 100644 --- a/src/chimeric/output.rs +++ b/src/chimeric/output.rs @@ -443,6 +443,7 @@ mod tests { Genome { sequence: vec![0u8; 2048], n_genome: 1024, + n_genome_real: 1024, n_chr_real: 2, chr_name: vec!["chr9".to_string(), "chr22".to_string()], chr_length: vec![512, 512], diff --git a/src/chimeric/score.rs b/src/chimeric/score.rs index ec87fe0..d5d1750 100644 --- a/src/chimeric/score.rs +++ b/src/chimeric/score.rs @@ -173,6 +173,7 @@ mod tests { Genome { sequence: seq, n_genome: 100, + n_genome_real: 100, n_chr_real: 1, chr_name: vec!["chr1".to_string()], chr_start: vec![0], diff --git a/src/genome/mod.rs b/src/genome/mod.rs index e3e616a..5d8d709 100644 --- a/src/genome/mod.rs +++ b/src/genome/mod.rs @@ -25,6 +25,12 @@ pub struct Genome { /// Total length of the forward (padded) genome. pub n_genome: u64, + /// Length of the real-genome forward strand, excluding any appended + /// Gsj splice-junction buffer. Equals `n_genome` before `append_sjdb` + /// and stays pinned at that value afterwards; matches STAR's + /// `mapGen.nGenomeReal`. + pub n_genome_real: u64, + /// Number of real chromosomes (not including scaffold/contigs if excluded). pub n_chr_real: usize, @@ -111,6 +117,7 @@ impl Genome { Ok(Genome { sequence, n_genome, + n_genome_real: n_genome, n_chr_real, chr_name, chr_length, diff --git a/src/index/io.rs b/src/index/io.rs index e8c7c41..12a65c6 100644 --- a/src/index/io.rs +++ b/src/index/io.rs @@ -11,6 +11,7 @@ use crate::index::packed_array::PackedArray; use crate::index::sa_index::SaIndex; use crate::index::suffix_array::SuffixArray; use crate::junction::SpliceJunctionDb; +use crate::junction::sjdb_insert; use crate::params::Parameters; use crate::quant::transcriptome::TranscriptomeIndex; @@ -99,13 +100,32 @@ impl GenomeIndex { ); } + // Reload prepared junctions + sjdbOverhang from sjdbInfo.txt when + // present. STAR appends a Gsj flanking-sequence buffer to the + // genome at build time; align-time code needs the parsed junctions + // to decode SA hits that land inside that buffer back to real + // `(donor, acceptor)` genome positions. + let sjdb_info_path = genome_dir.join("sjdbInfo.txt"); + let (prepared_junctions, sjdb_overhang) = if sjdb_info_path.exists() { + let tab = sjdb_insert::read_sjdb_info_tab(&sjdb_info_path, &genome)?; + log::info!( + "Loaded sjdbInfo.txt: {} junctions, sjdbOverhang={}", + tab.junctions.len(), + tab.sjdb_overhang, + ); + (tab.junctions, tab.sjdb_overhang) + } else { + (Vec::new(), 0) + }; + Ok(GenomeIndex { genome, suffix_array, sa_index, junction_db, transcriptome, - prepared_junctions: Vec::new(), + prepared_junctions, + sjdb_overhang, }) } } @@ -164,7 +184,8 @@ fn load_genome(genome_dir: &Path, _params: &Parameters) -> Result // (real + Gsj) lives in `genomeParameters.txt` under `genomeFileSizes`. // Prefer that value; fall back to the chr_start boundary for indices // built without a GTF. - let n_genome = read_genome_file_size(genome_dir)?.unwrap_or(chr_start[n_chr_real]); + let n_genome_real = chr_start[n_chr_real]; + let n_genome = read_genome_file_size(genome_dir)?.unwrap_or(n_genome_real); // Load Genome sequence file let genome_path = genome_dir.join("Genome"); @@ -192,6 +213,7 @@ fn load_genome(genome_dir: &Path, _params: &Parameters) -> Result Ok(Genome { sequence, n_genome, + n_genome_real, n_chr_real, chr_name, chr_length, diff --git a/src/index/mod.rs b/src/index/mod.rs index e84f26c..93ddd45 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -27,11 +27,16 @@ pub struct GenomeIndex { /// `transcriptInfo.tab` + friends at `genomeGenerate`, reloaded at /// `alignReads` from the same files. pub transcriptome: Option, - /// Prepared splice junctions (sorted/deduped) used only on the build - /// path to write `sjdbInfo.txt` + `sjdbList.out.tab`. Empty on the - /// load path — those files have already been written and are not - /// needed at align time. + /// Prepared splice junctions in their post-dedup order (the same + /// order they occupy in the Gsj buffer appended to the genome). + /// Populated on both build and load paths when sjdb is present; + /// empty for indices built without a GTF. Used at align time to + /// decode Gsj-region SA hits back to real-genome `(donor, acceptor)` + /// pairs. pub prepared_junctions: Vec, + /// `sjdbOverhang` recorded in `sjdbInfo.txt`. Zero when no sjdb + /// junctions are present. + pub sjdb_overhang: u32, } impl GenomeIndex { @@ -143,6 +148,12 @@ impl GenomeIndex { sa_index.data.len() ); + let sjdb_overhang = if prepared_junctions.is_empty() { + 0 + } else { + params.sjdb_overhang + }; + Ok(GenomeIndex { genome, suffix_array, @@ -150,6 +161,7 @@ impl GenomeIndex { junction_db, transcriptome, prepared_junctions, + sjdb_overhang, }) } diff --git a/src/io/bam.rs b/src/io/bam.rs index 2ac0517..60d3bbe 100644 --- a/src/io/bam.rs +++ b/src/io/bam.rs @@ -469,6 +469,7 @@ mod tests { Genome { sequence: vec![0, 1, 2, 3, 0, 1, 2, 3], // ACGTACGT n_genome: 8, + n_genome_real: 8, n_chr_real: 1, chr_name: vec!["chr1".to_string()], chr_length: vec![8], diff --git a/src/io/sam.rs b/src/io/sam.rs index 9e5cff5..026a8e8 100644 --- a/src/io/sam.rs +++ b/src/io/sam.rs @@ -1338,6 +1338,7 @@ mod tests { Genome { sequence: vec![0, 1, 2, 3, 0, 1, 2, 3], // ACGTACGT n_genome: 8, + n_genome_real: 8, n_chr_real: 1, chr_name: vec!["chr1".to_string()], chr_length: vec![8], diff --git a/src/junction/gtf.rs b/src/junction/gtf.rs index cdf98e0..73f7fef 100644 --- a/src/junction/gtf.rs +++ b/src/junction/gtf.rs @@ -316,6 +316,7 @@ mod tests { let genome = Genome { sequence: vec![0; 1000], n_genome: 1000, + n_genome_real: 1000, n_chr_real: 1, chr_start: vec![0, 1000], chr_length: vec![1000], @@ -367,6 +368,7 @@ mod tests { let genome = Genome { sequence: vec![0; 1000], n_genome: 1000, + n_genome_real: 1000, n_chr_real: 1, chr_start: vec![0, 1000], chr_length: vec![1000], @@ -445,6 +447,7 @@ mod tests { let genome = Genome { sequence: vec![0; 1000], n_genome: 1000, + n_genome_real: 1000, n_chr_real: 1, chr_start: vec![0, 1000], chr_length: vec![1000], @@ -476,6 +479,7 @@ mod tests { let genome = Genome { sequence: vec![0; 1000], n_genome: 1000, + n_genome_real: 1000, n_chr_real: 1, chr_start: vec![0, 1000], chr_length: vec![1000], @@ -522,6 +526,7 @@ mod tests { let genome = Genome { sequence: vec![0; 1000], n_genome: 1000, + n_genome_real: 1000, n_chr_real: 1, chr_start: vec![0, 1000], chr_length: vec![1000], @@ -605,6 +610,7 @@ mod tests { let genome = Genome { sequence: vec![0; 1000], n_genome: 1000, + n_genome_real: 1000, n_chr_real: 1, chr_start: vec![0, 1000], chr_length: vec![1000], diff --git a/src/junction/sj_output.rs b/src/junction/sj_output.rs index 472dbfc..acecd0c 100644 --- a/src/junction/sj_output.rs +++ b/src/junction/sj_output.rs @@ -516,6 +516,7 @@ mod tests { let genome = Genome { sequence: vec![0; 1000], n_genome: 1000, + n_genome_real: 1000, n_chr_real: 1, chr_start: vec![0, 1000], chr_length: vec![1000], @@ -577,6 +578,7 @@ mod tests { let genome = Genome { sequence: vec![0; 1000], n_genome: 1000, + n_genome_real: 1000, n_chr_real: 1, chr_start: vec![0, 1000], chr_length: vec![1000], @@ -613,6 +615,7 @@ mod tests { let genome = Genome { sequence: vec![0; 1000], n_genome: 1000, + n_genome_real: 1000, n_chr_real: 1, chr_start: vec![0, 1000], chr_length: vec![1000], @@ -696,6 +699,7 @@ mod tests { let genome = Genome { sequence: vec![0; 1000], n_genome: 1000, + n_genome_real: 1000, n_chr_real: 1, chr_start: vec![0, 1000], chr_length: vec![1000], diff --git a/src/junction/sjdb_insert.rs b/src/junction/sjdb_insert.rs index 01b19c0..f2105fe 100644 --- a/src/junction/sjdb_insert.rs +++ b/src/junction/sjdb_insert.rs @@ -21,6 +21,191 @@ use crate::error::Error; use crate::genome::Genome; use crate::junction::encode_motif; +/// Parsed `sjdbInfo.txt` header + body. Reconstructs the junction order +/// STAR used when building the Gsj buffer so align-time code can decode +/// SA hits in the Gsj region back to `(donor, acceptor)` genome positions. +#[derive(Debug, Clone)] +pub struct SjdbInfoTab { + pub sjdb_overhang: u32, + pub junctions: Vec, +} + +/// Parse `sjdbInfo.txt` and rebuild `PreparedJunction` entries in their +/// post-dedup order (the same order they occupy in the Gsj buffer). +/// +/// `chr_idx` is recovered by locating `original_start` in +/// `genome.chr_start[..n_chr_real]`. Positions falling in chromosome +/// padding return an `Error::Index`. +pub fn read_sjdb_info_tab(path: &Path, genome: &Genome) -> Result { + let contents = std::fs::read_to_string(path).map_err(|e| Error::io(e, path))?; + let mut lines = contents.lines(); + + let header = lines + .next() + .ok_or_else(|| Error::Index(format!("{}: empty sjdbInfo.txt", path.display())))?; + let mut hdr_iter = header.split('\t'); + let n_expected: usize = hdr_iter + .next() + .and_then(|s| s.parse().ok()) + .ok_or_else(|| Error::Index(format!("{}: bad header count", path.display())))?; + let sjdb_overhang: u32 = hdr_iter + .next() + .and_then(|s| s.parse().ok()) + .ok_or_else(|| Error::Index(format!("{}: bad header overhang", path.display())))?; + + let mut junctions = Vec::with_capacity(n_expected); + for (i, line) in lines.enumerate() { + if line.is_empty() { + continue; + } + let mut f = line.split('\t'); + let stored_start: u64 = f.next().and_then(|s| s.parse().ok()).ok_or_else(|| { + Error::Index(format!("{}: bad stored_start at row {}", path.display(), i)) + })?; + let stored_end: u64 = f.next().and_then(|s| s.parse().ok()).ok_or_else(|| { + Error::Index(format!("{}: bad stored_end at row {}", path.display(), i)) + })?; + let motif: u8 = f + .next() + .and_then(|s| s.parse().ok()) + .ok_or_else(|| Error::Index(format!("{}: bad motif at row {}", path.display(), i)))?; + let shift_left: u8 = f.next().and_then(|s| s.parse().ok()).ok_or_else(|| { + Error::Index(format!("{}: bad shift_left at row {}", path.display(), i)) + })?; + let shift_right: u8 = f.next().and_then(|s| s.parse().ok()).ok_or_else(|| { + Error::Index(format!("{}: bad shift_right at row {}", path.display(), i)) + })?; + let strand: u8 = f + .next() + .and_then(|s| s.parse().ok()) + .ok_or_else(|| Error::Index(format!("{}: bad strand at row {}", path.display(), i)))?; + + // stored_start = canonical: original_start; non-canonical: shifted = original_start - shift_left. + // Reverse the same mapping for start_pos (the shifted form). + let (start_pos, end_pos) = if motif == 0 { + (stored_start, stored_end) + } else { + ( + stored_start.saturating_sub(shift_left as u64), + stored_end.saturating_sub(shift_left as u64), + ) + }; + + let original_start = start_pos + shift_left as u64; + let chr_idx = genome + .chr_start + .iter() + .take(genome.n_chr_real) + .rposition(|&s| s <= original_start) + .ok_or_else(|| { + Error::Index(format!( + "{}: junction at {} precedes first chromosome", + path.display(), + original_start + )) + })?; + + junctions.push(PreparedJunction { + chr_idx, + start_pos, + end_pos, + motif, + shift_left, + shift_right, + strand, + }); + } + + if junctions.len() != n_expected { + return Err(Error::Index(format!( + "{}: header expected {} junctions, found {}", + path.display(), + n_expected, + junctions.len() + ))); + } + + Ok(SjdbInfoTab { + sjdb_overhang, + junctions, + }) +} + +/// Decode a single SA-hit position that falls in the Gsj flanking-sequence +/// buffer back to one or two real-genome positions, mirroring the layout +/// STAR builds in `build_gsj`. +/// +/// `fwd_pos` is a forward-strand genome position (`[0, n_genome)`). If +/// `fwd_pos < n_genome_real` the hit is on the real genome — the caller +/// should use the original `(fwd_pos, length)` directly and not invoke +/// this function. +/// +/// Returns a vector of `(real_fwd_pos, read_offset, length)` tuples: +/// - **One** entry when the hit lies entirely within a single junction's +/// donor flank or acceptor flank — the read↔genome correspondence is a +/// single contiguous run with `read_offset = 0` and `length` unchanged. +/// - **Two** entries when the hit crosses the donor/acceptor boundary at +/// `slot_offset == sjdb_overhang`: the first run covers the donor side, +/// the second covers the acceptor side (with `read_offset` advanced by +/// the donor-side length). The two runs together describe the same +/// read window but at non-adjacent genome positions — the downstream +/// stitch DP can then chain them via its splice branch. +/// +/// Returns an empty vector when the hit's Gsj junction index is out of +/// bounds, when the hit's slot offset lands on the trailing spacer byte, +/// or when the hit would extend past the spacer. +pub fn decode_gsj_hit( + fwd_pos: u64, + length: usize, + n_genome_real: u64, + sjdb_overhang: u32, + junctions: &[PreparedJunction], +) -> Vec<(u64, usize, usize)> { + if fwd_pos < n_genome_real { + return Vec::new(); + } + let overhang = sjdb_overhang as u64; + let sjdb_length = 2 * overhang + 1; + let rel = fwd_pos - n_genome_real; + let junction_idx = (rel / sjdb_length) as usize; + let slot_offset = rel % sjdb_length; + + if junction_idx >= junctions.len() { + return Vec::new(); + } + let pj = &junctions[junction_idx]; + + if slot_offset >= 2 * overhang { + return Vec::new(); + } + if length == 0 { + return Vec::new(); + } + if slot_offset + length as u64 > 2 * overhang { + return Vec::new(); + } + + let donor_genome_start = pj.original_start().saturating_sub(overhang); + let acceptor_genome_start = pj.original_end() + 1; + + if slot_offset + length as u64 <= overhang || slot_offset >= overhang { + // Single-flank hit: the matching bases live in the corresponding + // real-genome flank too, so the same SA range already supplies an + // entry at the equivalent real-genome position. Re-emitting it + // here would just create duplicate work for the cluster dedup. + Vec::new() + } else { + let donor_len = (overhang - slot_offset) as usize; + let acceptor_len = length - donor_len; + let donor_real = donor_genome_start + slot_offset; + let acceptor_real = acceptor_genome_start; + vec![ + (donor_real, 0, donor_len), + (acceptor_real, donor_len, acceptor_len), + ] + } +} + /// STAR's inter-SJ spacer byte in the Gsj buffer (same value STAR uses /// for inter-chromosome padding — `IncludeDefine.h::GENOME_spacingChar`). const GSJ_SPACING: u8 = 5; @@ -386,6 +571,7 @@ mod tests { Genome { sequence: seq, n_genome: n, + n_genome_real: n, n_chr_real: 1, chr_name: vec!["chr1".to_string()], chr_length: vec![n], @@ -790,6 +976,7 @@ mod tests { let genome = Genome { sequence: seq, n_genome: 2000, + n_genome_real: 2000, n_chr_real: 2, chr_name: vec!["chrA".to_string(), "chrB".to_string()], chr_length: vec![1000, 1000], @@ -824,4 +1011,140 @@ mod tests { // `old_wrong` is on wrong strand for its motif — STAR replaces. assert_eq!(out[0], new_right); } + + #[test] + fn decode_gsj_hit_outside_buffer_returns_empty() { + let junctions = vec![pj(0, 1000, 2000, 1, 0, 1)]; + // Hit in the real genome: caller should bypass this function. + assert!(decode_gsj_hit(500, 50, 5000, 100, &junctions).is_empty()); + } + + #[test] + fn decode_gsj_hit_single_flank_is_dropped() { + // sjdb_length = 2*100 + 1 = 201. Junction 0 occupies bytes + // [n_genome_real .. n_genome_real + 201). A hit at slot_offset 5 + // with length 20 sits entirely in the donor flank — the same SA + // range already includes a real-genome entry there, so the decoder + // returns no work to avoid duplicates. + let junctions = vec![pj(0, 1000, 2000, 1, 0, 1)]; + let n_genome_real = 5000; + let overhang = 100u32; + let hit_pos = n_genome_real + 5; + assert!(decode_gsj_hit(hit_pos, 20, n_genome_real, overhang, &junctions).is_empty()); + // Acceptor-only hit (slot_offset >= overhang) is also dropped. + let hit_pos2 = n_genome_real + 120; + assert!(decode_gsj_hit(hit_pos2, 20, n_genome_real, overhang, &junctions).is_empty()); + } + + #[test] + fn decode_gsj_hit_boundary_crossing_splits_into_two() { + // sjdb_length=21 (overhang 10), one canonical junction at + // original [1000..2000]. Donor flank covers forward genome + // [990..1000), acceptor flank covers [2001..2011). A hit at + // slot_offset 6 with length 12 straddles the boundary: 4 donor + // bytes, 8 acceptor bytes. + let junctions = vec![pj(0, 1000, 2000, 1, 0, 1)]; + let n_genome_real = 5000; + let overhang = 10u32; + let hit_pos = n_genome_real + 6; + let out = decode_gsj_hit(hit_pos, 12, n_genome_real, overhang, &junctions); + assert_eq!(out.len(), 2); + // Donor: real_fwd = 990 + 6 = 996, read_offset = 0, sub_len = 4. + assert_eq!(out[0], (996, 0, 4)); + // Acceptor: real_fwd = 2001, read_offset = 4, sub_len = 8. + assert_eq!(out[1], (2001, 4, 8)); + } + + #[test] + fn decode_gsj_hit_second_junction_offset() { + // Two junctions, overhang=10 → sjdb_length=21. Junction 1 begins + // at n_genome_real + 21. A boundary-crossing hit there must use + // junction 1's original coords, not junction 0's. + let junctions = vec![pj(0, 1000, 2000, 1, 0, 1), pj(0, 3000, 4000, 1, 0, 1)]; + let n_genome_real = 5000; + let overhang = 10u32; + let hit_pos = n_genome_real + 21 + 6; + let out = decode_gsj_hit(hit_pos, 12, n_genome_real, overhang, &junctions); + assert_eq!(out.len(), 2); + assert_eq!(out[0].0, 2996); // donor flank of J1 starts at 2990 + assert_eq!(out[1].0, 4001); // acceptor flank of J1 starts at 4001 + } + + #[test] + fn decode_gsj_hit_uses_original_for_noncanonical() { + // Non-canonical motif with shift_left=3: stored coords sit 3bp + // left of original. The Gsj buffer was built from ORIGINAL coords, + // so the decoder must un-shift back to original_start when + // computing donor/acceptor positions. + let junctions = vec![pj(0, 1000, 2000, 0, 3, 0)]; + let n_genome_real = 5000; + let overhang = 10u32; + // Boundary-crossing hit at slot_offset 5, length 10 (5 donor + 5 acceptor). + let hit_pos = n_genome_real + 5; + let out = decode_gsj_hit(hit_pos, 10, n_genome_real, overhang, &junctions); + assert_eq!(out.len(), 2); + // original_start = 1003 → donor flank [993..1003). Hit at slot 5 = 993+5 = 998. + assert_eq!(out[0], (998, 0, 5)); + // original_end = 2003 → acceptor flank starts at 2004. + assert_eq!(out[1], (2004, 5, 5)); + } + + #[test] + fn decode_gsj_hit_oob_junction_or_spacer_returns_empty() { + let junctions = vec![pj(0, 1000, 2000, 1, 0, 1)]; + let n_genome_real = 5000; + let overhang = 10u32; + // Past last junction. + assert!( + decode_gsj_hit(n_genome_real + 21, 5, n_genome_real, overhang, &junctions).is_empty() + ); + // On the trailing spacer byte (slot_offset == 2*overhang). + assert!( + decode_gsj_hit(n_genome_real + 20, 1, n_genome_real, overhang, &junctions).is_empty() + ); + // Hit extends past the spacer. + assert!( + decode_gsj_hit(n_genome_real + 19, 3, n_genome_real, overhang, &junctions).is_empty() + ); + } + + #[test] + fn read_sjdb_info_tab_roundtrip() { + let mut seq = vec![5u8; 4000]; + seq[..2000].copy_from_slice(&vec![0u8; 2000]); + let genome = Genome { + sequence: seq, + n_genome: 2000, + n_genome_real: 2000, + n_chr_real: 1, + chr_name: vec!["I".to_string()], + chr_length: vec![1000], + chr_start: vec![0, 1000], + }; + let junctions = vec![ + PreparedJunction { + chr_idx: 0, + start_pos: 200, + end_pos: 500, + motif: 1, + shift_left: 0, + shift_right: 1, + strand: 1, + }, + PreparedJunction { + chr_idx: 0, + start_pos: 700, + end_pos: 800, + motif: 0, + shift_left: 3, + shift_right: 0, + strand: 0, + }, + ]; + let tmp = tempfile::NamedTempFile::new().unwrap(); + write_sjdb_info_tab(tmp.path(), &junctions, 100).unwrap(); + let parsed = read_sjdb_info_tab(tmp.path(), &genome).unwrap(); + assert_eq!(parsed.sjdb_overhang, 100); + assert_eq!(parsed.junctions, junctions); + } } diff --git a/src/quant/mod.rs b/src/quant/mod.rs index 9516d99..689c2a7 100644 --- a/src/quant/mod.rs +++ b/src/quant/mod.rs @@ -384,6 +384,7 @@ mod tests { Genome { sequence: vec![0u8; 2000], n_genome: 2000, + n_genome_real: 2000, n_chr_real: 2, chr_start: vec![0, 1000, 2000], chr_length: vec![1000, 1000], diff --git a/src/quant/transcriptome.rs b/src/quant/transcriptome.rs index fc8ad26..b3e1f26 100644 --- a/src/quant/transcriptome.rs +++ b/src/quant/transcriptome.rs @@ -1391,6 +1391,7 @@ mod tests { Genome { sequence: vec![0u8; 3000], n_genome: 3000, + n_genome_real: 3000, n_chr_real: 2, chr_start: vec![0, 1000, 3000], chr_length: vec![1000, 2000], @@ -2269,6 +2270,7 @@ mod tests { let genome = Genome { sequence: seq, n_genome: 1000, + n_genome_real: 1000, n_chr_real: 1, chr_start: vec![0, 1000], chr_length: vec![1000], @@ -2324,6 +2326,7 @@ mod tests { let genome = Genome { sequence: seq, n_genome: 1000, + n_genome_real: 1000, n_chr_real: 1, chr_start: vec![0, 1000], chr_length: vec![1000],