From 6724959b3c445095a0e6e8ccc545776791411d02 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 12 May 2026 18:38:27 +0100 Subject: [PATCH 1/2] fix(sjdb): key SpliceJunctionDb in genome-absolute 0-based coordinates The DB was keyed in chromosome-local 1-based coords (straight from the GTF) but consulted in genome-absolute 0-based at stitch time, so every sjdb lookup during alignment missed. Annotated splice events never got the sjdb_score bonus and the stricter overhang gate fired in their place, dropping ~50 % of GT/AG splices on the test profile. The stats-time site queried in genome-absolute 1-based-equivalent, so it accidentally matched on chr 0 (chr_start=0) but missed on every other chromosome -- producing inconsistent answers between SJ.out.tab and Log.final.out on the same BAM. Normalise the DB to genome-absolute 0-based at construction, matching the convention prepared_junctions and SpliceJunctionStats already use. Update the stats-time call site to query in the new space. Stitch-time needs no change. Fixes #27 Co-Authored-By: Claude --- src/index/mod.rs | 13 ++------ src/junction/gtf.rs | 40 ++++++++++++----------- src/junction/mod.rs | 69 ++++++++++++++++++++++++++++++++++++--- src/junction/sj_output.rs | 26 ++++++++------- src/lib.rs | 8 ++--- 5 files changed, 108 insertions(+), 48 deletions(-) diff --git a/src/index/mod.rs b/src/index/mod.rs index e84f26c..1448071 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -83,20 +83,13 @@ impl GenomeIndex { log::info!("Extracted {} annotated junctions from GTF", raw.len()); let jdb = SpliceJunctionDb::from_raw_junctions(&raw); - // `extract_junctions_from_exons` returns chromosome-local 1-based - // intron coordinates (matching STAR's `sjdbList.fromGTF.out.tab`). - // `prepare_junction` + `sjdbInfo.txt` expect 0-based absolute - // genome offsets (matching STAR's `sjdbPrepare.cpp` and - // `detect_splice_motif`'s `genome.sequence[donor_pos]` access), - // so convert here. let prepared: Vec = raw .iter() - .map(|&(chr_idx, start_local_1b, end_local_1b, strand)| { - let chr_off = genome.chr_start[chr_idx]; + .map(|&(chr_idx, intron_start, intron_end, strand)| { sjdb_insert::prepare_junction( chr_idx, - chr_off + start_local_1b - 1, - chr_off + end_local_1b - 1, + intron_start, + intron_end, strand, &genome, n_genome_real, diff --git a/src/junction/gtf.rs b/src/junction/gtf.rs index cdf98e0..5a8d63a 100644 --- a/src/junction/gtf.rs +++ b/src/junction/gtf.rs @@ -150,7 +150,10 @@ fn parse_attributes(attr_str: &str) -> Result, Error> { /// `transcript_tag` is the GTF attribute key for the parent transcript /// (STAR: `sjdbGTFtagExonParentTranscript`, default `"transcript_id"`). /// -/// Returns: Vec<(chr_idx, intron_start, intron_end, strand)> +/// Returns: Vec<(chr_idx, intron_start, intron_end, strand)> where +/// `intron_start` / `intron_end` are genome-absolute 0-based positions +/// of the first and last intronic bases (matching the convention used +/// by `PreparedJunction` and the rest of the alignment pipeline). pub fn extract_junctions_configured( exons: Vec, genome: &Genome, @@ -199,25 +202,27 @@ pub fn extract_junctions_configured( // Sort exons by position exons.sort_by_key(|e| e.start); - // Calculate junction coordinates from consecutive exons + let chr_off = genome.chr_start[chr_idx]; + for i in 0..exons.len() - 1 { let exon1 = &exons[i]; let exon2 = &exons[i + 1]; - // Intron coordinates (1-based, STAR convention) - let intron_start = exon1.end + 1; - let intron_end = exon2.start - 1; + let intron_start_local_1b = exon1.end + 1; + let intron_end_local_1b = exon2.start.saturating_sub(1); - // Validate junction - if intron_end <= intron_start { + if intron_end_local_1b <= intron_start_local_1b { log::warn!( "Invalid junction coordinates: {}-{} (possibly overlapping exons)", - intron_start, - intron_end + intron_start_local_1b, + intron_end_local_1b ); continue; } + let intron_start = chr_off + intron_start_local_1b - 1; + let intron_end = chr_off + intron_end_local_1b - 1; + junctions.push((chr_idx, intron_start, intron_end, strand)); } } @@ -357,9 +362,9 @@ mod tests { assert_eq!(junctions.len(), 1); let (chr_idx, start, end, strand) = junctions[0]; assert_eq!(chr_idx, 0); - assert_eq!(start, 201); // exon1.end + 1 - assert_eq!(end, 299); // exon2.start - 1 - assert_eq!(strand, 1); // + strand + assert_eq!(start, 200); + assert_eq!(end, 298); + assert_eq!(strand, 1); } #[test] @@ -433,11 +438,8 @@ mod tests { let junctions = extract_junctions_from_exons(exons, &genome).unwrap(); - // Should have 1 unique junction (both transcripts share junction 201-299) - // Note: T1 has 100-200, 300-400 and T2 has 100-200, 300-500 - // They share the junction from exon ending at 200 to exon starting at 300 assert_eq!(junctions.len(), 1); - assert_eq!(junctions[0], (0, 201, 299, 1)); // chr0, junction 201-299, strand + + assert_eq!(junctions[0], (0, 200, 298, 1)); } #[test] @@ -563,8 +565,8 @@ mod tests { assert_eq!(junctions.len(), 1); let (_chr_idx, start, end, _strand) = junctions[0]; - assert_eq!(start, 201); - assert_eq!(end, 299); + assert_eq!(start, 200); + assert_eq!(end, 298); } #[test] @@ -642,6 +644,6 @@ mod tests { let junctions = extract_junctions_configured(exons, &genome, "Parent").unwrap(); assert_eq!(junctions.len(), 1); - assert_eq!(junctions[0], (0, 201, 299, 1)); + assert_eq!(junctions[0], (0, 200, 298, 1)); } } diff --git a/src/junction/mod.rs b/src/junction/mod.rs index 2f45cc3..7a90eac 100644 --- a/src/junction/mod.rs +++ b/src/junction/mod.rs @@ -19,7 +19,12 @@ use crate::genome::Genome; use std::collections::HashMap; use std::path::Path; -/// Key for junction lookup: (chr_idx, intron_start, intron_end, strand) +/// Key for junction lookup: (chr_idx, intron_start, intron_end, strand). +/// +/// `intron_start` / `intron_end` are genome-absolute 0-based positions of +/// the first and last intronic bases, matching the convention used by +/// `PreparedJunction`, `SpliceJunctionStats`, and the alignment-time +/// `genome_pos` variables. #[derive(Hash, Eq, PartialEq, Clone, Debug)] struct JunctionKey { chr_idx: usize, @@ -102,12 +107,12 @@ impl SpliceJunctionDb { Self { junctions } } - /// Check if a junction is annotated in the GTF + /// Check if a junction is annotated in the GTF. /// /// # Arguments /// * `chr_idx` - Chromosome index - /// * `start` - Intron start position (last exon base + 1) - /// * `end` - Intron end position (first exon base of next exon - 1) + /// * `start` - Genome-absolute 0-based position of the first intronic base + /// * `end` - Genome-absolute 0-based position of the last intronic base /// * `strand` - Strand (0=unknown, 1=+, 2=-) /// /// # Returns @@ -397,4 +402,60 @@ mod tests { assert_eq!(novel_junctions.len(), 1); assert_eq!(novel_junctions[0].0.intron_start, 300); } + + #[test] + fn test_db_keyed_in_genome_absolute_zero_based_multi_chr() { + use crate::junction::gtf::{GtfRecord, extract_junctions_configured}; + use std::collections::HashMap; + + // Two-chromosome toy genome so chr_start[1] != 0. + let genome = Genome { + sequence: vec![0; 4000], + n_genome: 2000, + n_chr_real: 2, + chr_start: vec![0, 1000, 2000], + chr_length: vec![1000, 1000], + chr_name: vec!["chr1".to_string(), "chr2".to_string()], + }; + + let make_exon = |seqname: &str, start: u64, end: u64, transcript: &str| -> GtfRecord { + let mut attrs: HashMap = HashMap::new(); + attrs.insert("gene_id".to_string(), "G".to_string()); + attrs.insert("transcript_id".to_string(), transcript.to_string()); + GtfRecord { + seqname: seqname.to_string(), + feature: "exon".to_string(), + start, + end, + strand: '+', + attributes: attrs, + } + }; + + let exons = vec![ + make_exon("chr1", 100, 200, "T1"), + make_exon("chr1", 300, 400, "T1"), + make_exon("chr2", 100, 200, "T2"), + make_exon("chr2", 300, 400, "T2"), + ]; + + let raw = extract_junctions_configured(exons, &genome, "transcript_id").unwrap(); + let db = SpliceJunctionDb::from_raw_junctions(&raw); + assert_eq!(db.len(), 2); + + // Junction on chr1: intron local 1-based 201..299 + // → genome-absolute 0-based: chr_start[0] + 200 .. chr_start[0] + 298 + assert!(db.is_annotated(0, 200, 298, 1)); + // Off-by-one in either direction must miss. + assert!(!db.is_annotated(0, 201, 299, 1)); + assert!(!db.is_annotated(0, 199, 297, 1)); + + // Junction on chr2: same chr-local coords, but chr_start[1] = 1000 + // → genome-absolute 0-based: 1200 .. 1298 + assert!(db.is_annotated(1, 1200, 1298, 1)); + // The pre-fix chr-local 1-based key (201, 299) must not match on chr2. + assert!(!db.is_annotated(1, 201, 299, 1)); + // The pre-fix stitch-time off-by-one (1199, 1297) must not match either. + assert!(!db.is_annotated(1, 1199, 1297, 1)); + } } diff --git a/src/junction/sj_output.rs b/src/junction/sj_output.rs index 472dbfc..9dccdbe 100644 --- a/src/junction/sj_output.rs +++ b/src/junction/sj_output.rs @@ -21,7 +21,11 @@ use std::io::{BufWriter, Write}; use std::path::Path; use std::sync::atomic::{AtomicU32, Ordering}; -/// Key for junction statistics +/// Key for junction statistics. +/// +/// `intron_start` / `intron_end` are genome-absolute 0-based positions of +/// the first and last intronic bases, matching `SpliceJunctionDb` and +/// `PreparedJunction`. #[derive(Hash, Eq, PartialEq, Clone, Debug)] pub(crate) struct SjKey { pub chr_idx: usize, @@ -74,12 +78,12 @@ impl SpliceJunctionStats { } } - /// Record a junction from alignment (thread-safe) + /// Record a junction from alignment (thread-safe). /// /// # Arguments /// * `chr_idx` - Chromosome index - /// * `start` - Intron start (1-based) - /// * `end` - Intron end (1-based) + /// * `start` - Genome-absolute 0-based position of the first intronic base + /// * `end` - Genome-absolute 0-based position of the last intronic base /// * `strand` - Strand (0=unknown, 1=+, 2=-) /// * `motif` - Splice motif /// * `is_unique` - True if from unique mapping, false if multi-mapping @@ -260,8 +264,8 @@ impl SpliceJunctionStats { .ok_or_else(|| Error::Index("Invalid chromosome index in junction".to_string()))?; let chr_start_pos = genome.chr_start[key.chr_idx]; - let chr_pos_start = key.intron_start - chr_start_pos; - let chr_pos_end = key.intron_end - chr_start_pos; + let chr_pos_start = key.intron_start - chr_start_pos + 1; + let chr_pos_end = key.intron_end - chr_start_pos + 1; writeln!( writer, @@ -538,8 +542,8 @@ mod tests { // First junction (sorted by position) let fields1: Vec<&str> = lines[0].split('\t').collect(); assert_eq!(fields1[0], "chr1"); // chr - assert_eq!(fields1[1], "100"); // start - assert_eq!(fields1[2], "200"); // end + assert_eq!(fields1[1], "101"); // chr-local 1-based start + assert_eq!(fields1[2], "201"); // chr-local 1-based end assert_eq!(fields1[3], "1"); // strand assert_eq!(fields1[4], "1"); // motif (GT/AG) assert_eq!(fields1[5], "0"); // not annotated @@ -550,8 +554,8 @@ mod tests { // Second junction (annotated, bypasses filters) let fields2: Vec<&str> = lines[1].split('\t').collect(); assert_eq!(fields2[0], "chr1"); - assert_eq!(fields2[1], "300"); - assert_eq!(fields2[2], "400"); + assert_eq!(fields2[1], "301"); + assert_eq!(fields2[2], "401"); assert_eq!(fields2[3], "2"); // - strand assert_eq!(fields2[4], "3"); // motif (GC/AG, direct encoding) assert_eq!(fields2[5], "1"); // annotated @@ -597,7 +601,7 @@ mod tests { // Canonical (overhang 20 >= 12) should pass assert_eq!(lines.len(), 1); let fields: Vec<&str> = lines[0].split('\t').collect(); - assert_eq!(fields[1], "300"); // Only the canonical junction remains + assert_eq!(fields[1], "301"); // Only the canonical junction remains } #[test] diff --git a/src/lib.rs b/src/lib.rs index d229763..c755bfa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -793,8 +793,8 @@ fn extract_junction_keys( match op { CigarOp::RefSkip(len) => { let intron_len = *len; - let intron_start = genome_pos + 1; - let intron_end = genome_pos + intron_len as u64; + let intron_start = genome_pos; + let intron_end = genome_pos + intron_len as u64 - 1; let motif = scorer.detect_splice_motif(genome_pos, intron_len, &index.genome); let strand = match motif.implied_strand() { @@ -1857,8 +1857,8 @@ fn record_transcript_junctions( CigarOp::RefSkip(len) => { // This is a splice junction let intron_len = *len; - let intron_start = genome_pos + 1; // 1-based, first intronic base - let intron_end = genome_pos + intron_len as u64; // 1-based, last intronic base + let intron_start = genome_pos; + let intron_end = genome_pos + intron_len as u64 - 1; // Detect splice motif let motif = scorer.detect_splice_motif(genome_pos, intron_len, &index.genome); From ac09650d0ffa774293bf45e47207a5f553409f10 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 12 May 2026 19:01:18 +0100 Subject: [PATCH 2/2] fix(sjdb): off-by-one on stitch-time acceptor coordinate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous commit normalised the DB to genome-absolute 0-based and updated the stats-time query. Stitch-time was left unchanged on the assumption that it already passed genome-absolute 0-based — half right. donor_fwd was correct (first intron base, 0-based) but acceptor_fwd = donor_fwd + del landed on the first base AFTER the intron, while the DB keys store the last intron base. Lookup missed every annotated junction. Subtract 1 from the acceptor to land on the last intron base. After this, Log.final.out Annotated (sjdb) goes from 0 to a non-zero count that's consistent with the annotated=1 row count in SJ.out.tab on the same BAM. Co-Authored-By: Claude --- src/align/stitch.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/align/stitch.rs b/src/align/stitch.rs index 5c78d87..e003d36 100644 --- a/src/align/stitch.rs +++ b/src/align/stitch.rs @@ -1302,12 +1302,11 @@ fn stitch_align_to_transcript( return None; } - // Check annotation (needed for sjdbScore bonus and finalization check) let is_annotated = junction_db.is_some_and(|db| { let junc_donor_sa = (donor_sa as i64 + jr_shift as i64) as u64; let donor_fwd = index.sa_pos_to_forward(junc_donor_sa, cluster.is_reverse, del as usize); - let acceptor_fwd = donor_fwd + del as u64; + let acceptor_fwd = donor_fwd + del as u64 - 1; db.is_annotated(cluster.chr_idx, donor_fwd, acceptor_fwd, 0) || db.is_annotated(cluster.chr_idx, donor_fwd, acceptor_fwd, 1) || db.is_annotated(cluster.chr_idx, donor_fwd, acceptor_fwd, 2)