diff --git a/Cargo.toml b/Cargo.toml index a70b909..8828255 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,7 @@ categories = ["science::bioinformatics", "encoding", "data-structures"] keywords = ["bioinformatics", "nucleotide", "sequencing", "genomics", "fastq"] [dependencies] -anyhow = "1.0.100" +anyhow = {version = "1.0.100", optional = true} auto_impl = "1.3.0" bitnuc = "0.4.0" bytemuck = { version = "1.24.0", features = ["derive", "extern_crate_alloc"] } @@ -20,17 +20,24 @@ itoa = "1.0.17" memchr = "2.7.6" memmap2 = "0.9.9" num_cpus = "1.17.0" +paraseq = { version = "0.4.8", optional = true } +parking_lot = {version = "0.12.5", optional = true } rand = { version = "0.9.2", features = ["small_rng"] } sucds = "0.8.3" thiserror = "2.0.17" zstd = { version = "0.13.3", features = ["zstdmt"] } [dev-dependencies] -parking_lot = "0.12.5" anyhow = "1.0.100" +parking_lot = "0.12.5" clap = { version = "4.5.54", features = ["derive"] } paraseq = "0.4.8" +[features] +default = ["paraseq", "anyhow"] +anyhow = ["dep:anyhow"] +paraseq = ["dep:paraseq", "dep:parking_lot"] + [lints.clippy] pedantic = { level = "warn", priority = -1 } cast_possible_truncation = "allow" diff --git a/examples/auto-write.rs b/examples/auto-write.rs new file mode 100644 index 0000000..1a57292 --- /dev/null +++ b/examples/auto-write.rs @@ -0,0 +1,122 @@ +use std::{fs::File, io::BufWriter}; + +use anyhow::Result; +use binseq::{BinseqWriterBuilder, write::Format}; +use bitnuc::BitSize; +use clap::Parser; + +type BoxedWriter = Box; + +#[derive(Parser)] +struct Args { + /// Input FASTX to encode into BINSEQ format + #[clap(required = true)] + input: String, + + /// Input FASTX to encode into BINSEQ format (R2) + #[clap(required = false)] + input2: Option, + + /// Output file path for BINSEQ format + #[clap(short = 'o', long)] + output: Option, + + /// Default prefix for writing BINSEQ: `.` + #[clap(short = 'p', long, default_value = "output")] + prefix: String, + + /// Format of the output BINSEQ file + /// + /// [bq: bq|BQ|b, vbq: vbq|VBQ|v, cbq: cbq|CBQ|c] + #[clap(short = 'f', long)] + format: Option, + + /// Exclude quality information in BINSEQ output + /// + /// (bq ignores quality always) + #[clap(short = 'Q', long)] + exclude_quality: bool, + + /// Exclude sequence headers in BINSEQ output + /// + /// (bq ignores headers always) + #[clap(short = 'H', long)] + exclude_headers: bool, + + /// Compression level for BINSEQ output (0: auto) + #[clap(long, default_value_t = 0)] + compression_level: i32, + + /// Default BITSIZE for BINSEQ output (2: 2bit, 4: 4bit) + #[clap(long, default_value_t = 2)] + bitsize: u8, + + /// Default BLOCKSIZE in KB for BINSEQ output (vbq,cbq) + #[clap(long, default_value_t = 128)] + blocksize: usize, + + /// Number of threads to use for parallel processing, 0: all available + #[clap(short = 'T', long, default_value = "0")] + threads: usize, +} +impl Args { + /// Determines the output format based on the file extension or the provided format + fn format(&self) -> Format { + if let Some(format) = self.format { + format + } else { + if let Some(output) = &self.output { + match output.split(".").last() { + Some("bq") => Format::Bq, + Some("vbq") => Format::Vbq, + Some("cbq") => Format::Cbq, + _ => Format::default(), + } + } else { + Format::default() + } + } + } + fn bitsize(&self) -> BitSize { + match self.bitsize { + 4 => BitSize::Four, + _ => BitSize::Two, + } + } + + /// Creates an output file handle + fn ohandle(&self) -> Result { + let path = if let Some(output) = &self.output { + output.to_string() + } else { + format!("{}{}", &self.prefix, self.format().extension()) + }; + let ofile = File::create(path).map(BufWriter::new)?; + Ok(Box::new(ofile)) + } + + fn is_paired(&self) -> bool { + self.input2.is_some() + } +} + +fn main() -> Result<()> { + let args = Args::parse(); + let handle = args.ohandle()?; + let builder = BinseqWriterBuilder::new(args.format()) + .bitsize(args.bitsize()) + .block_size(args.blocksize * 1024) + .headers(!args.exclude_headers) + .quality(!args.exclude_quality) + .compression_level(args.compression_level) + .encode_fastx(handle); + if args.is_paired() { + builder.input_paired(&args.input, args.input2.as_ref().unwrap()) + } else { + builder.input(&args.input) + } + .threads(args.threads) + .run()?; + + Ok(()) +} diff --git a/src/bq/writer.rs b/src/bq/writer.rs index 839094e..52b2935 100644 --- a/src/bq/writer.rs +++ b/src/bq/writer.rs @@ -475,7 +475,9 @@ impl BinseqWriter { write_flag(&mut self.inner, record.flag().unwrap_or(0))?; } - if record.is_paired() != self.encoder.header.is_paired() { + // Check paired status - writer can require paired (record must have R2), + // but if writer is single-end, we simply ignore any R2 data in the record. + if self.encoder.header.is_paired() && !record.is_paired() { return Err(WriteError::ConfigurationMismatch { attribute: "paired", expected: self.encoder.header.is_paired(), @@ -484,7 +486,7 @@ impl BinseqWriter { .into()); } - if record.is_paired() { + if self.encoder.header.is_paired() { if let Some((sbuffer, xbuffer)) = self .encoder .encode_paired(record.s_seq, record.x_seq.unwrap_or_default())? diff --git a/src/cbq/core/block.rs b/src/cbq/core/block.rs index a47a512..0931cf3 100644 --- a/src/cbq/core/block.rs +++ b/src/cbq/core/block.rs @@ -160,31 +160,76 @@ impl ColumnarBlock { self.nuclen = self.seq.len(); } - fn add_flag(&mut self, record: &SequencingRecord) { - if let Some(flag) = record.flag { + fn add_flag(&mut self, record: &SequencingRecord) -> Result<()> { + if self.header.has_flags() { + let Some(flag) = record.flag else { + return Err(WriteError::ConfigurationMismatch { + attribute: "flag", + expected: true, + actual: false, + } + .into()); + }; self.flags.push(flag); } + Ok(()) } - fn add_headers(&mut self, record: &SequencingRecord) { - if let Some(header) = record.s_header { - self.l_headers.push(header.len() as u64); - self.headers.extend_from_slice(header); - } - if let Some(header) = record.x_header { - self.l_headers.push(header.len() as u64); - self.headers.extend_from_slice(header); + fn add_headers(&mut self, record: &SequencingRecord) -> Result<()> { + if self.header.has_headers() { + let Some(sheader) = record.s_header else { + return Err(WriteError::ConfigurationMismatch { + attribute: "s_header", + expected: true, + actual: false, + } + .into()); + }; + self.l_headers.push(sheader.len() as u64); + self.headers.extend_from_slice(sheader); + + if self.header.is_paired() { + let Some(xheader) = record.x_header else { + return Err(WriteError::ConfigurationMismatch { + attribute: "x_header", + expected: true, + actual: false, + } + .into()); + }; + self.l_headers.push(xheader.len() as u64); + self.headers.extend_from_slice(xheader); + } } + Ok(()) } /// Note: this does not check if quality scores are different lengths from sequence - fn add_quality(&mut self, record: &SequencingRecord) { - if let Some(qual) = record.s_qual { - self.qual.extend_from_slice(qual); - } - if let Some(qual) = record.x_qual { - self.qual.extend_from_slice(qual); + fn add_quality(&mut self, record: &SequencingRecord) -> Result<()> { + if self.header.has_qualities() { + let Some(squal) = record.s_qual() else { + return Err(WriteError::ConfigurationMismatch { + attribute: "s_qual", + expected: true, + actual: false, + } + .into()); + }; + self.qual.extend_from_slice(squal); + + if self.header.is_paired() { + let Some(xqual) = record.x_qual() else { + return Err(WriteError::ConfigurationMismatch { + attribute: "x_qual", + expected: true, + actual: false, + } + .into()); + }; + self.qual.extend_from_slice(xqual); + } } + Ok(()) } /// Calculate the usage of the block as a percentage @@ -194,7 +239,13 @@ impl ColumnarBlock { } pub(crate) fn can_fit(&self, record: &SequencingRecord<'_>) -> bool { - self.current_size + record.size() <= self.header.block_size as usize + let configured_size = record.configured_size_cbq( + self.header.is_paired(), + self.header.has_flags(), + self.header.has_headers(), + self.header.has_qualities(), + ); + self.current_size + configured_size <= self.header.block_size as usize } pub(crate) fn can_ingest(&self, other: &Self) -> bool { @@ -203,23 +254,32 @@ impl ColumnarBlock { /// Ensure that the record can be pushed into the block fn validate_record(&self, record: &SequencingRecord) -> Result<()> { + let configured_size = record.configured_size_cbq( + self.header.is_paired(), + self.header.has_flags(), + self.header.has_headers(), + self.header.has_qualities(), + ); + if !self.can_fit(record) { - if record.size() > self.header.block_size as usize { + if configured_size > self.header.block_size as usize { return Err(WriteError::RecordSizeExceedsMaximumBlockSize( - record.size(), + configured_size, self.header.block_size as usize, ) .into()); } return Err(CbqError::BlockFull { current_size: self.current_size, - record_size: record.size(), + record_size: configured_size, block_size: self.header.block_size as usize, } .into()); } - if record.is_paired() != self.header.is_paired() { + // Check paired status - writer can require paired (record must have R2), + // but if writer is single-end, we simply ignore any R2 data in the record. + if self.header.is_paired() && !record.is_paired() { return Err(WriteError::ConfigurationMismatch { attribute: "paired", expected: self.header.is_paired(), @@ -228,7 +288,9 @@ impl ColumnarBlock { .into()); } - if record.has_flags() != self.header.has_flags() { + // For flags, headers, and qualities: the writer can require them (record must have them), + // but if the writer doesn't need them, we simply ignore any extra data in the record. + if self.header.has_flags() && !record.has_flags() { return Err(WriteError::ConfigurationMismatch { attribute: "flags", expected: self.header.has_flags(), @@ -237,7 +299,7 @@ impl ColumnarBlock { .into()); } - if record.has_headers() != self.header.has_headers() { + if self.header.has_headers() && !record.has_headers() { return Err(WriteError::ConfigurationMismatch { attribute: "headers", expected: self.header.has_headers(), @@ -246,7 +308,7 @@ impl ColumnarBlock { .into()); } - if record.has_qualities() != self.header.has_qualities() { + if self.header.has_qualities() && !record.has_qualities() { return Err(WriteError::ConfigurationMismatch { attribute: "qualities", expected: self.header.has_qualities(), @@ -260,11 +322,18 @@ impl ColumnarBlock { pub fn push(&mut self, record: SequencingRecord) -> Result<()> { self.validate_record(&record)?; + let configured_size = record.configured_size_cbq( + self.header.is_paired(), + self.header.has_flags(), + self.header.has_headers(), + self.header.has_qualities(), + ); + self.add_sequence(&record); - self.add_flag(&record); - self.add_headers(&record); - self.add_quality(&record); - self.current_size += record.size(); + self.add_flag(&record)?; + self.add_headers(&record)?; + self.add_quality(&record)?; + self.current_size += configured_size; self.num_records += 1; Ok(()) diff --git a/src/error.rs b/src/error.rs index 8b69357..9475a61 100644 --- a/src/error.rs +++ b/src/error.rs @@ -48,12 +48,17 @@ pub enum Error { BitnucError(#[from] bitnuc::Error), /// Conversion errors from anyhow errors + #[cfg(feature = "anyhow")] #[error("Generic error: {0}")] AnyhowError(#[from] anyhow::Error), /// Generic errors for other unexpected situations #[error("Generic error: {0}")] GenericError(#[from] Box), + + #[cfg(feature = "paraseq")] + #[error("Fastx encoding error: {0}")] + FastxEncodingError(#[from] FastxEncodingError), } impl Error { /// Checks if the error is an index mismatch error @@ -349,6 +354,16 @@ pub enum CbqError { MissingSequenceOnSequencingRecord, } +#[cfg(feature = "paraseq")] +#[derive(thiserror::Error, Debug)] +pub enum FastxEncodingError { + #[error("Empty FASTX file")] + EmptyFastxFile, + + #[error("Builder not provided with any input")] + MissingInput, +} + #[derive(thiserror::Error, Debug)] pub enum ExtensionError { /// When the extension is not supported diff --git a/src/lib.rs b/src/lib.rs index 0ac2049..41f58a0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -103,6 +103,9 @@ pub mod prelude; /// Write operations generic over the BINSEQ variant pub mod write; +/// Utilities for working with BINSEQ files +pub mod utils; + pub use error::{Error, IntoBinseqError, Result}; pub use parallel::{BinseqReader, ParallelProcessor, ParallelReader}; pub use policy::{Policy, RNG_SEED}; diff --git a/src/record/sequencing_record.rs b/src/record/sequencing_record.rs index d35a2d8..95b6fd0 100644 --- a/src/record/sequencing_record.rs +++ b/src/record/sequencing_record.rs @@ -1,4 +1,4 @@ -use crate::{Result, error::WriteError}; +use crate::{BitSize, Result, error::WriteError}; /// A zero-copy record used to write sequences to binary sequence files. /// @@ -101,17 +101,133 @@ impl<'a> SequencingRecord<'a> { self.flag } - /// Returns the size of the record in bytes (used for CBQ block capacity) + /// Returns the configured size of this record for CBQ format. + /// + /// CBQ uses columnar storage so there are no per-record length prefixes. + /// This calculates the size based on writer configuration, ignoring any + /// extra data in the record that the writer won't use. #[inline] #[must_use] - pub fn size(&self) -> usize { - (self.s_seq.len().div_ceil(4)) - + self.s_qual.map_or(0, <[u8]>::len) - + self.s_header.map_or(0, <[u8]>::len) - + self.x_seq.map_or(0, |q| q.len().div_ceil(4)) - + self.x_qual.map_or(0, <[u8]>::len) - + self.x_header.map_or(0, <[u8]>::len) - + self.flag.map_or(0, |f| f.to_le_bytes().len()) + pub fn configured_size_cbq( + &self, + is_paired: bool, + has_flags: bool, + has_headers: bool, + has_qualities: bool, + ) -> usize { + // CBQ uses 2-bit encoding: 4 nucleotides per byte, 32 per u64 word + const NUCS_PER_WORD: usize = 32; + + let mut size = 0; + + // Sequence size (encoded into u64 words) + let s_chunks = self.s_seq.len().div_ceil(NUCS_PER_WORD); + size += s_chunks * 8; + + // Extended sequence (only if writer is configured for paired) + if is_paired { + let x_chunks = self.x_seq.map_or(0, |x| x.len().div_ceil(NUCS_PER_WORD)); + size += x_chunks * 8; + } + + // Flag size (only if writer is configured for flags) + if has_flags { + size += 8; // u64 + } + + // Header size (only if writer is configured for headers) + if has_headers { + size += self.s_header.map_or(0, <[u8]>::len); + if is_paired { + size += self.x_header.map_or(0, <[u8]>::len); + } + } + + // Quality size (only if writer is configured for qualities) + if has_qualities { + size += self.s_qual.map_or(0, <[u8]>::len); + if is_paired { + size += self.x_qual.map_or(0, <[u8]>::len); + } + } + + size + } + + /// Returns the configured size of this record for VBQ format. + /// + /// VBQ uses a row-based format with length prefixes for each field. + /// This calculates the size based on writer configuration, ignoring any + /// extra data in the record that the writer won't use. + /// + /// The VBQ record layout is: + /// - Flag (8 bytes, if `has_flags`) + /// - `s_len` (8 bytes) + /// - `x_len` (8 bytes) + /// - `s_seq` (encoded, rounded up to 8-byte words) + /// - `s_qual` (raw bytes, if `has_qualities`) + /// - `s_header_len` + `s_header` (8 + len bytes, if `has_headers` and `s_header` present) + /// - `x_seq` (encoded, rounded up to 8-byte words, if paired) + /// - `x_qual` (raw bytes, if `has_qualities` and paired) + /// - `x_header_len` + `x_header` (8 + len bytes, if `has_headers` and `x_header` present) + #[inline] + #[must_use] + pub fn configured_size_vbq( + &self, + is_paired: bool, + has_flags: bool, + has_headers: bool, + has_qualities: bool, + bitsize: BitSize, + ) -> usize { + // Calculate how many nucleotides fit per byte for the given bitsize + let nucs_per_byte = if matches!(bitsize, BitSize::Two) { + 4 + } else { + 2 + }; + // VBQ packs sequences into u64 words + let nucs_per_word = nucs_per_byte * 8; + + let mut size = 0; + + // Length prefixes: s_len and x_len (always present) + size += 16; // 2 * u64 + + // Flag (8 bytes, if has_flags) + if has_flags { + size += 8; + } + + // Primary sequence (encoded into u64 words) + let s_chunks = self.s_seq.len().div_ceil(nucs_per_word); + size += s_chunks * 8; + + // Extended sequence (only if writer is configured for paired) + if is_paired { + let x_chunks = self.x_seq.map_or(0, |x| x.len().div_ceil(nucs_per_word)); + size += x_chunks * 8; + } + + // Quality scores (raw bytes, only if writer configured for qualities) + if has_qualities { + size += self.s_qual.map_or(0, <[u8]>::len); + if is_paired { + size += self.x_qual.map_or(0, <[u8]>::len); + } + } + + // Headers (length prefix + raw bytes, only if writer configured for headers) + if has_headers { + if let Some(h) = self.s_header { + size += 8 + h.len(); // length prefix + header bytes + } + if is_paired && let Some(h) = self.x_header { + size += 8 + h.len(); // length prefix + header bytes + } + } + + size } #[inline] diff --git a/src/utils/fastx.rs b/src/utils/fastx.rs new file mode 100644 index 0000000..14de9e1 --- /dev/null +++ b/src/utils/fastx.rs @@ -0,0 +1,444 @@ +//! FASTX encoding utilities for converting FASTX files to BINSEQ formats +//! +//! This module provides utilities for encoding FASTX (FASTA/FASTQ) files into +//! BINSEQ formats using parallel processing via the `paraseq` crate. + +use std::{ + io::{Read, Write}, + path::{Path, PathBuf}, + sync::Arc, +}; + +use paraseq::{ + Record, fastx, + prelude::{IntoProcessError, PairedParallelProcessor, ParallelProcessor, ParallelReader}, +}; +use parking_lot::Mutex; + +use crate::{ + BinseqWriter, BinseqWriterBuilder, IntoBinseqError, Result, SequencingRecordBuilder, + error::FastxEncodingError, +}; + +type BoxedRead = Box; +type BoxedWrite = Box; + +/// Input source for FASTX encoding +#[derive(Debug, Clone)] +enum FastxInput { + /// Read from stdin + Stdin, + /// Read from a single file + Single(PathBuf), + /// Read from paired files (R1, R2) + Paired(PathBuf, PathBuf), +} + +/// Builder for encoding FASTX files to BINSEQ format +/// +/// This builder is created by calling [`BinseqWriterBuilder::encode_fastx`] and +/// provides a fluent interface for configuring the input source and threading options. +/// +/// # Example +/// +/// ```rust,no_run +/// use binseq::write::{BinseqWriterBuilder, Format}; +/// use std::fs::File; +/// +/// // Encode from stdin to VBQ +/// let writer = BinseqWriterBuilder::new(Format::Vbq) +/// .quality(true) +/// .headers(true) +/// .encode_fastx(Box::new(File::create("output.vbq")?)) +/// .input_stdin() +/// .threads(8) +/// .run()?; +/// # Ok::<(), binseq::Error>(()) +/// ``` +pub struct FastxEncoderBuilder { + builder: BinseqWriterBuilder, + output: BoxedWrite, + input: Option, + threads: usize, +} + +impl FastxEncoderBuilder { + /// Create a new encoder builder + pub(crate) fn new(builder: BinseqWriterBuilder, output: BoxedWrite) -> Self { + Self { + builder, + output, + input: None, + threads: 0, // 0 means use all available cores + } + } + + /// Read from a single FASTX file + /// + /// # Example + /// + /// ```rust,no_run + /// # use binseq::write::{BinseqWriterBuilder, Format}; + /// # use std::fs::File; + /// BinseqWriterBuilder::new(Format::Vbq) + /// .encode_fastx(Box::new(File::create("output.vbq")?)) + /// .input("input.fastq") + /// .run()?; + /// # Ok::<(), binseq::Error>(()) + /// ``` + pub fn input>(mut self, path: P) -> Self { + self.input = Some(FastxInput::Single(path.as_ref().to_path_buf())); + self + } + + /// Read from stdin + /// + /// # Example + /// + /// ```rust,no_run + /// # use binseq::write::{BinseqWriterBuilder, Format}; + /// # use std::fs::File; + /// BinseqWriterBuilder::new(Format::Vbq) + /// .encode_fastx(Box::new(File::create("output.vbq")?)) + /// .input_stdin() + /// .run()?; + /// # Ok::<(), binseq::Error>(()) + /// ``` + #[must_use] + pub fn input_stdin(mut self) -> Self { + self.input = Some(FastxInput::Stdin); + self + } + + /// Read from paired FASTX files (R1, R2) + /// + /// This automatically sets the writer to paired mode. + /// + /// # Example + /// + /// ```rust,no_run + /// # use binseq::write::{BinseqWriterBuilder, Format}; + /// # use std::fs::File; + /// BinseqWriterBuilder::new(Format::Vbq) + /// .encode_fastx(Box::new(File::create("output.vbq")?)) + /// .input_paired("R1.fastq", "R2.fastq") + /// .run()?; + /// # Ok::<(), binseq::Error>(()) + /// ``` + pub fn input_paired>(mut self, r1: P, r2: P) -> Self { + self.input = Some(FastxInput::Paired( + r1.as_ref().to_path_buf(), + r2.as_ref().to_path_buf(), + )); + // Automatically set paired mode + self.builder = self.builder.paired(true); + self + } + + /// Set the number of threads for parallel processing + /// + /// If not set or set to 0, uses all available CPU cores. + /// + /// # Example + /// + /// ```rust,no_run + /// # use binseq::write::{BinseqWriterBuilder, Format}; + /// # use std::fs::File; + /// BinseqWriterBuilder::new(Format::Vbq) + /// .encode_fastx(Box::new(File::create("output.vbq")?)) + /// .input("input.fastq") + /// .threads(8) + /// .run()?; + /// # Ok::<(), binseq::Error>(()) + /// ``` + #[must_use] + pub fn threads(mut self, n: usize) -> Self { + self.threads = n; + self + } + + /// Execute the FASTX encoding + /// + /// This consumes the builder and returns a `BinseqWriter` that has been + /// populated with all records from the input FASTX file(s). + /// + /// # Errors + /// + /// Returns an error if: + /// - The input files cannot be read + /// - The FASTX format is invalid + /// - The writer configuration is incompatible with the input + /// - For BQ format with stdin input (cannot detect sequence length) + /// + /// # Example + /// + /// ```rust,no_run + /// # use binseq::write::{BinseqWriterBuilder, Format}; + /// # use std::fs::File; + /// let writer = BinseqWriterBuilder::new(Format::Vbq) + /// .encode_fastx(Box::new(File::create("output.vbq")?)) + /// .input("input.fastq") + /// .run()?; + /// # Ok::<(), binseq::Error>(()) + /// ``` + pub fn run(mut self) -> Result<()> { + let (r1, r2) = match self.input { + Some(FastxInput::Single(path)) => { + // build interleaved reader + let mut reader = + fastx::Reader::from_path(path).map_err(IntoBinseqError::into_binseq_error)?; + let (slen, xlen) = detect_seq_len(&mut reader, true)?; + self.builder = self.builder.slen(slen as u32).xlen(xlen as u32); + (reader, None) + } + Some(FastxInput::Stdin) => { + let mut reader = + fastx::Reader::from_stdin().map_err(IntoBinseqError::into_binseq_error)?; + let (slen, xlen) = detect_seq_len(&mut reader, true)?; + self.builder = self.builder.slen(slen as u32).xlen(xlen as u32); + (reader, None) + } + Some(FastxInput::Paired(path1, path2)) => { + // build interleaved reader + let mut reader1 = + fastx::Reader::from_path(path1).map_err(IntoBinseqError::into_binseq_error)?; + let mut reader2 = + fastx::Reader::from_path(path2).map_err(IntoBinseqError::into_binseq_error)?; + let (slen, _) = detect_seq_len(&mut reader1, false)?; + let (xlen, _) = detect_seq_len(&mut reader2, false)?; + self.builder = self.builder.slen(slen as u32).xlen(xlen as u32); + (reader1, Some(reader2)) + } + None => return Err(FastxEncodingError::MissingInput.into()), + }; + + let writer = self.builder.build(self.output)?; + if writer.is_paired() { + if let Some(r2) = r2 { + encode_paired(writer, r1, r2, self.threads)?; + } else { + encode_interleaved(writer, r1, self.threads)?; + } + } else { + encode_single_file(writer, r1, self.threads)?; + } + + Ok(()) + } +} + +/// Encode single-end reads from a file +fn encode_single_file( + writer: BinseqWriter, + reader: fastx::Reader, + threads: usize, +) -> Result<()> { + let mut encoder = Encoder::new(writer)?; + reader + .process_parallel(&mut encoder, threads) + .map_err(IntoBinseqError::into_binseq_error)?; + encoder.finish()?; + Ok(()) +} + +/// Encode paired-end reads from interleaved file +fn encode_interleaved( + writer: BinseqWriter, + reader: fastx::Reader, + threads: usize, +) -> Result<()> { + let mut encoder = Encoder::new(writer)?; + reader + .process_parallel_interleaved(&mut encoder, threads) + .map_err(IntoBinseqError::into_binseq_error)?; + encoder.finish()?; + Ok(()) +} + +/// Encode paired-end reads from files +fn encode_paired( + writer: BinseqWriter, + r1: fastx::Reader, + r2: fastx::Reader, + threads: usize, +) -> Result<()> { + let mut encoder = Encoder::new(writer)?; + r1.process_parallel_paired(r2, &mut encoder, threads) + .map_err(IntoBinseqError::into_binseq_error)?; + encoder.finish()?; + Ok(()) +} + +fn detect_seq_len( + reader: &mut fastx::Reader, + interleaved: bool, +) -> Result<(usize, usize)> { + // Initialze the record set + let mut rset = reader.new_record_set(); + rset.fill(reader) + .map_err(IntoBinseqError::into_binseq_error)?; + + let (slen, xlen) = if interleaved { + let mut rset_iter = rset.iter(); + let Some(Ok(slen)) = rset_iter.next().map(|r| -> Result { + let rec = r.map_err(IntoBinseqError::into_binseq_error)?; + Ok(rec.seq().len()) + }) else { + return Err(FastxEncodingError::EmptyFastxFile.into()); + }; + let Some(Ok(xlen)) = rset_iter.next().map(|r| -> Result { + let rec = r.map_err(IntoBinseqError::into_binseq_error)?; + Ok(rec.seq().len()) + }) else { + return Err(FastxEncodingError::EmptyFastxFile.into()); + }; + (slen, xlen) + } else { + let mut rset_iter = rset.iter(); + let Some(Ok(slen)) = rset_iter.next().map(|r| -> Result { + let rec = r.map_err(IntoBinseqError::into_binseq_error)?; + Ok(rec.seq().len()) + }) else { + return Err(FastxEncodingError::EmptyFastxFile.into()); + }; + (slen, 0) + }; + reader + .reload(&mut rset) + .map_err(IntoBinseqError::into_binseq_error)?; + Ok((slen, xlen)) +} + +/// Parallel encoder for FASTX records to BINSEQ format +/// +/// This struct implements the `ParallelProcessor` and `PairedParallelProcessor` +/// traits from `paraseq` to enable efficient parallel encoding of FASTX files. +#[derive(Clone)] +struct Encoder { + /// Global writer (shared across threads) + writer: Arc>>>, + /// Thread-local writer buffer + thread_writer: BinseqWriter>, +} + +impl Encoder { + /// Create a new encoder with a global writer + pub fn new(writer: BinseqWriter>) -> Result { + let thread_writer = writer.new_headless_buffer()?; + Ok(Self { + writer: Arc::new(Mutex::new(writer)), + thread_writer, + }) + } + /// Finish the stream on the global writer + pub fn finish(&mut self) -> Result<()> { + self.writer.lock().finish()?; + Ok(()) + } +} + +impl ParallelProcessor for Encoder { + fn process_record(&mut self, record: Rf) -> paraseq::Result<()> { + let seq = record.seq(); + let seq_record = SequencingRecordBuilder::default() + .s_header(record.id()) + .s_seq(&seq) + .opt_s_qual(record.qual()) + .build() + .map_err(IntoProcessError::into_process_error)?; + self.thread_writer + .push(seq_record) + .map_err(IntoProcessError::into_process_error)?; + Ok(()) + } + + fn on_batch_complete(&mut self) -> paraseq::Result<()> { + self.writer + .lock() + .ingest(&mut self.thread_writer) + .map_err(IntoProcessError::into_process_error)?; + Ok(()) + } +} + +impl PairedParallelProcessor for Encoder { + fn process_record_pair(&mut self, record1: Rf, record2: Rf) -> paraseq::Result<()> { + let sseq = record1.seq(); + let xseq = record2.seq(); + let seq_record = SequencingRecordBuilder::default() + .s_header(record1.id()) + .s_seq(&sseq) + .opt_s_qual(record1.qual()) + .x_header(record2.id()) + .x_seq(&xseq) + .opt_x_qual(record2.qual()) + .build() + .map_err(IntoProcessError::into_process_error)?; + + self.thread_writer + .push(seq_record) + .map_err(IntoProcessError::into_process_error)?; + Ok(()) + } + + fn on_batch_complete(&mut self) -> paraseq::Result<()> { + self.writer + .lock() + .ingest(&mut self.thread_writer) + .map_err(IntoProcessError::into_process_error)?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::write::Format; + use std::io::Cursor; + + #[test] + fn test_encoder_builder_construction() { + let builder = BinseqWriterBuilder::new(Format::Vbq); + let handle = Box::new(Cursor::new(Vec::new())); + let encoder_builder = FastxEncoderBuilder::new(builder, handle); + + assert!(encoder_builder.input.is_none()); + assert_eq!(encoder_builder.threads, 0); + } + + #[test] + fn test_encoder_builder_input_methods() { + let builder = BinseqWriterBuilder::new(Format::Vbq); + let handle = Box::new(Cursor::new(Vec::new())); + let encoder_builder = FastxEncoderBuilder::new(builder, handle) + .input("test.fastq") + .threads(4); + + assert!(matches!(encoder_builder.input, Some(FastxInput::Single(_)))); + assert_eq!(encoder_builder.threads, 4); + } + + #[test] + fn test_encoder_builder_stdin() { + let builder = BinseqWriterBuilder::new(Format::Vbq); + let handle = Box::new(Cursor::new(Vec::new())); + let encoder_builder = FastxEncoderBuilder::new(builder, handle).input_stdin(); + + assert!(matches!(encoder_builder.input, Some(FastxInput::Stdin))); + } + + #[test] + fn test_encoder_builder_paired() { + let builder = BinseqWriterBuilder::new(Format::Vbq); + let handle = Box::new(Cursor::new(Vec::new())); + let encoder_builder = + FastxEncoderBuilder::new(builder, handle).input_paired("r1.fastq", "r2.fastq"); + + assert!(matches!( + encoder_builder.input, + Some(FastxInput::Paired(_, _)) + )); + // Should automatically set paired mode + assert!(encoder_builder.builder.paired); + } +} diff --git a/src/utils/mod.rs b/src/utils/mod.rs new file mode 100644 index 0000000..031c492 --- /dev/null +++ b/src/utils/mod.rs @@ -0,0 +1,7 @@ +//! Utility modules for working with BINSEQ files + +#[cfg(feature = "paraseq")] +pub mod fastx; + +#[cfg(feature = "paraseq")] +pub use fastx::FastxEncoderBuilder; diff --git a/src/vbq/writer.rs b/src/vbq/writer.rs index f7c01ee..262004a 100644 --- a/src/vbq/writer.rs +++ b/src/vbq/writer.rs @@ -78,44 +78,6 @@ use crate::vbq::header::{SIZE_BLOCK_HEADER, SIZE_HEADER}; use crate::vbq::index::{INDEX_END_MAGIC, IndexHeader}; use crate::vbq::{BlockIndex, BlockRange}; -/// Calculates the storage size in bytes required for a record without quality scores -/// -/// This function calculates the total size needed to store a record in the VBINSEQ format, -/// including the flag, sequence lengths, and the encoded sequence data. The formula -/// used is: `S = w(Cs + Cx + 3)` where: -/// -/// - `w`: Word size (8 bytes) -/// - `Cs`: Chunk size of the primary sequence in 64-bit words -/// - `Cx`: Chunk size of the extended sequence in 64-bit words (for paired-end reads) -/// - `3`: Additional words for flag, primary length, and extended length -/// -/// # Parameters -/// -/// * `schunk` - Number of 64-bit words needed for the primary sequence -/// * `xchunk` - Number of 64-bit words needed for the extended sequence (0 for single-end) -/// -/// # Returns -/// -/// The total size in bytes needed to store the record -pub fn record_byte_size(schunk: usize, xchunk: usize, has_flags: bool) -> usize { - 8 * (schunk + xchunk + if has_flags { 3 } else { 2 }) -} - -fn record_byte_size_quality_header( - schunk: usize, - xchunk: usize, - squal: usize, - xqual: usize, - sheader: usize, - xheader: usize, - has_flags: bool, -) -> usize { - // counting the header length bytes (u64) - let bytes_sheader = if sheader > 0 { sheader + 8 } else { 0 }; - let bytes_xheader = if xheader > 0 { xheader + 8 } else { 0 }; - record_byte_size(schunk, xchunk, has_flags) + squal + xqual + bytes_sheader + bytes_xheader -} - /// A builder for creating configured `VBinseqWriter` instances /// /// This builder provides a fluent interface for configuring and creating a @@ -359,7 +321,13 @@ impl VBinseqWriter { inner, header, encoder: Encoder::with_policy(header.bits, policy), - cblock: BlockWriter::new(header.block as usize, header.compressed, header.flags), + cblock: BlockWriter::new( + header.block as usize, + header.compressed, + header.flags, + header.qual, + header.headers, + ), ranges: Vec::new(), bytes_written: 0, records_written: 0, @@ -545,8 +513,9 @@ impl VBinseqWriter { /// writer.finish().unwrap(); /// ``` pub fn push(&mut self, record: SequencingRecord) -> Result { - // Check configuration mismatches - if record.is_paired() != self.header.paired { + // Check paired status - writer can require paired (record must have R2), + // but if writer is single-end, we simply ignore any R2 data in the record. + if self.header.paired && !record.is_paired() { return Err(WriteError::ConfigurationMismatch { attribute: "paired", expected: self.header.paired, @@ -554,7 +523,10 @@ impl VBinseqWriter { } .into()); } - if record.has_qualities() != self.header.qual { + + // For qualities and headers: the writer can require them (record must have them), + // but if the writer doesn't need them, we simply ignore any extra data in the record. + if self.header.qual && !record.has_qualities() { return Err(WriteError::ConfigurationMismatch { attribute: "qual", expected: self.header.qual, @@ -562,7 +534,7 @@ impl VBinseqWriter { } .into()); } - if record.has_headers() != self.header.headers { + if self.header.headers && !record.has_headers() { return Err(WriteError::ConfigurationMismatch { attribute: "headers", expected: self.header.headers, @@ -571,21 +543,20 @@ impl VBinseqWriter { .into()); } + let record_size = record.configured_size_vbq( + self.header.paired, + self.header.flags, + self.header.headers, + self.header.qual, + self.header.bits, + ); + if record.is_paired() { // encode the sequences if let Some((sbuffer, xbuffer)) = self .encoder .encode_paired(record.s_seq, record.x_seq.unwrap_or_default())? { - let record_size = record_byte_size_quality_header( - sbuffer.len(), - xbuffer.len(), - record.s_qual.map_or(0, <[u8]>::len), - record.x_qual.map_or(0, <[u8]>::len), - record.s_header.map_or(0, <[u8]>::len), - record.x_header.map_or(0, <[u8]>::len), - self.header.flags, - ); if self.cblock.exceeds_block_size(record_size)? { impl_flush_block( &mut self.inner, @@ -604,15 +575,6 @@ impl VBinseqWriter { } else { // encode the sequence if let Some(sbuffer) = self.encoder.encode_single(record.s_seq)? { - let record_size = record_byte_size_quality_header( - sbuffer.len(), - 0, - record.s_qual.map_or(0, <[u8]>::len), - 0, - record.s_header.map_or(0, <[u8]>::len), - 0, - self.header.flags, - ); if self.cblock.exceeds_block_size(record_size)? { impl_flush_block( &mut self.inner, @@ -874,9 +836,19 @@ struct BlockWriter { compress: bool, /// Has flags has_flags: bool, + /// Has quality scores + has_qualities: bool, + /// Has headers + has_headers: bool, } impl BlockWriter { - fn new(block_size: usize, compress: bool, has_flags: bool) -> Self { + fn new( + block_size: usize, + compress: bool, + has_flags: bool, + has_qualities: bool, + has_headers: bool, + ) -> Self { Self { pos: 0, starts: Vec::default(), @@ -887,6 +859,8 @@ impl BlockWriter { padding: vec![0; block_size], compress, has_flags, + has_qualities, + has_headers, } } @@ -910,7 +884,7 @@ impl BlockWriter { // Tracks the record start position self.starts.push(self.pos); - // Write the flag + // Write the flag (only if configured) if self.has_flags { self.write_flag(record.flag.unwrap_or(0))?; } @@ -919,24 +893,40 @@ impl BlockWriter { self.write_length(record.s_seq.len() as u64)?; self.write_length(record.x_seq.map_or(0, <[u8]>::len) as u64)?; - // Write the primary sequence and optional quality + // Write the primary sequence self.write_buffer(sbuf)?; - if let Some(qual) = record.s_qual { + + // Write primary quality (only if configured) + if self.has_qualities + && let Some(qual) = record.s_qual + { self.write_u8buf(qual)?; } - if let Some(sheader) = record.s_header { + + // Write primary header (only if configured) + if self.has_headers + && let Some(sheader) = record.s_header + { self.write_length(sheader.len() as u64)?; self.write_u8buf(sheader)?; } - // Write the optional extended sequence and optional quality + // Write the optional extended sequence if let Some(xbuf) = xbuf { self.write_buffer(xbuf)?; } - if let Some(qual) = record.x_qual { + + // Write extended quality (only if configured) + if self.has_qualities + && let Some(qual) = record.x_qual + { self.write_u8buf(qual)?; } - if let Some(xheader) = record.x_header { + + // Write extended header (only if configured) + if self.has_headers + && let Some(xheader) = record.x_header + { self.write_length(xheader.len() as u64)?; self.write_u8buf(xheader)?; } @@ -1526,14 +1516,4 @@ mod tests { Ok(()) } - - #[test] - #[allow(clippy::identity_op)] - fn test_record_byte_size() { - let size = record_byte_size(2, 0, true); - assert_eq!(size, 8 * (2 + 0 + 3)); // 40 bytes - - let size = record_byte_size(4, 8, true); - assert_eq!(size, 8 * (4 + 8 + 3)); // 128 bytes - } } diff --git a/src/write.rs b/src/write.rs index 0b97814..c7cfd23 100644 --- a/src/write.rs +++ b/src/write.rs @@ -120,8 +120,8 @@ impl Format { /// | `headless(true)` | applied | applied | applied | #[derive(Debug, Clone)] pub struct BinseqWriterBuilder { - format: Format, - paired: bool, + pub(crate) format: Format, + pub(crate) paired: bool, quality: bool, headers: bool, flags: bool, @@ -131,8 +131,8 @@ pub struct BinseqWriterBuilder { policy: Option, headless: bool, bitsize: Option, - slen: Option, - xlen: Option, + pub(crate) slen: Option, + pub(crate) xlen: Option, } impl BinseqWriterBuilder { @@ -240,6 +240,50 @@ impl BinseqWriterBuilder { self } + /// Encode FASTX file(s) to BINSEQ format + /// + /// This method returns a [`FastxEncoderBuilder`] that allows you to configure + /// the input source and threading options before executing the encoding. + /// + /// This is an alternative to [`build`](Self::build) that directly processes + /// FASTX files using parallel processing. + /// + /// # Availability + /// + /// This method is only available when the `paraseq` feature is enabled. + /// + /// # Example + /// + /// ```rust,no_run + /// use binseq::write::{BinseqWriterBuilder, Format}; + /// use std::fs::File; + /// + /// // Encode from stdin to VBQ + /// let writer = BinseqWriterBuilder::new(Format::Vbq) + /// .quality(true) + /// .headers(true) + /// .encode_fastx(File::create("output.vbq")?) + /// .input_stdin() + /// .threads(8) + /// .run()?; + /// + /// // Encode paired-end reads + /// let writer = BinseqWriterBuilder::new(Format::Vbq) + /// .quality(true) + /// .encode_fastx(File::create("output.vbq")?) + /// .input_paired("R1.fastq", "R2.fastq") + /// .run()?; + /// # Ok::<(), binseq::Error>(()) + /// ``` + #[cfg(feature = "paraseq")] + #[must_use] + pub fn encode_fastx( + self, + output: W, + ) -> crate::utils::FastxEncoderBuilder { + crate::utils::FastxEncoderBuilder::new(self, Box::new(output)) + } + /// Build the writer /// /// # Errors @@ -776,4 +820,691 @@ mod tests { Ok(()) } + + // ==================== Record Specification Tests ==================== + // + // These tests verify that writers correctly handle records with different + // levels of specification relative to the writer's configuration: + // - Under-specified: record is missing data the writer needs (should error) + // - Over-specified: record has extra data the writer ignores (should succeed) + // - Correctly-specified: record matches writer config exactly (should succeed) + + /// Helper to create a minimal single-end record (sequence only) + fn minimal_single_record() -> SequencingRecord<'static> { + SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGTACGTACGTACGTACGTACGT") + .build() + .unwrap() + } + + /// Helper to create a minimal paired record (sequences only) + fn minimal_paired_record() -> SequencingRecord<'static> { + SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGTACGTACGTACGTACGTACGT") + .x_seq(b"TGCATGCATGCATGCATGCATGCATGCATGCA") + .build() + .unwrap() + } + + /// Helper to create a fully-specified single-end record + fn full_single_record() -> SequencingRecord<'static> { + SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGTACGTACGTACGTACGTACGT") + .s_qual(b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII") + .s_header(b"read1") + .flag(42u64) + .build() + .unwrap() + } + + /// Helper to create a fully-specified paired record + fn full_paired_record() -> SequencingRecord<'static> { + SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGTACGTACGTACGTACGTACGT") + .s_qual(b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII") + .s_header(b"read1") + .x_seq(b"TGCATGCATGCATGCATGCATGCATGCATGCA") + .x_qual(b"JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ") + .x_header(b"read2") + .flag(42u64) + .build() + .unwrap() + } + + // ==================== VBQ Tests ==================== + + #[test] + fn test_vbq_single_minimal_writer_minimal_record() -> Result<()> { + // Writer: single-end, no quality, no headers, no flags + // Record: single-end, no quality, no headers, no flags + // Expected: success (correctly specified) + let mut writer = BinseqWriterBuilder::new(Format::Vbq) + .paired(false) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = minimal_single_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_vbq_single_minimal_writer_full_record() -> Result<()> { + // Writer: single-end, no quality, no headers, no flags + // Record: single-end, with quality, headers, flags + // Expected: success (over-specified - extra data ignored) + let mut writer = BinseqWriterBuilder::new(Format::Vbq) + .paired(false) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = full_single_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_vbq_single_full_writer_minimal_record() -> Result<()> { + // Writer: single-end, with quality, headers, flags + // Record: single-end, no quality, no headers, no flags + // Expected: error (under-specified) + let mut writer = BinseqWriterBuilder::new(Format::Vbq) + .paired(false) + .quality(true) + .headers(true) + .flags(true) + .build(Cursor::new(Vec::new()))?; + + let record = minimal_single_record(); + let result = writer.push(record); + assert!(result.is_err()); + Ok(()) + } + + #[test] + fn test_vbq_single_full_writer_full_record() -> Result<()> { + // Writer: single-end, with quality, headers, flags + // Record: single-end, with quality, headers, flags + // Expected: success (correctly specified) + let mut writer = BinseqWriterBuilder::new(Format::Vbq) + .paired(false) + .quality(true) + .headers(true) + .flags(true) + .build(Cursor::new(Vec::new()))?; + + let record = full_single_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_vbq_paired_writer_single_record() -> Result<()> { + // Writer: paired + // Record: single-end + // Expected: error (under-specified - missing R2) + let mut writer = BinseqWriterBuilder::new(Format::Vbq) + .paired(true) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = minimal_single_record(); + let result = writer.push(record); + assert!(result.is_err()); + Ok(()) + } + + #[test] + fn test_vbq_single_writer_paired_record() -> Result<()> { + // Writer: single-end + // Record: paired + // Expected: success (over-specified - R2 ignored) + let mut writer = BinseqWriterBuilder::new(Format::Vbq) + .paired(false) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = minimal_paired_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_vbq_paired_minimal_writer_paired_full_record() -> Result<()> { + // Writer: paired, no quality, no headers, no flags + // Record: paired, with quality, headers, flags + // Expected: success (over-specified) + let mut writer = BinseqWriterBuilder::new(Format::Vbq) + .paired(true) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = full_paired_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_vbq_paired_full_writer_paired_full_record() -> Result<()> { + // Writer: paired, with quality, headers, flags + // Record: paired, with quality, headers, flags + // Expected: success (correctly specified) + let mut writer = BinseqWriterBuilder::new(Format::Vbq) + .paired(true) + .quality(true) + .headers(true) + .flags(true) + .build(Cursor::new(Vec::new()))?; + + let record = full_paired_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + // ==================== CBQ Tests ==================== + + #[test] + fn test_cbq_single_minimal_writer_minimal_record() -> Result<()> { + // Writer: single-end, no quality, no headers, no flags + // Record: single-end, no quality, no headers, no flags + // Expected: success (correctly specified) + let mut writer = BinseqWriterBuilder::new(Format::Cbq) + .paired(false) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = minimal_single_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_cbq_single_minimal_writer_full_record() -> Result<()> { + // Writer: single-end, no quality, no headers, no flags + // Record: single-end, with quality, headers, flags + // Expected: success (over-specified - extra data ignored) + let mut writer = BinseqWriterBuilder::new(Format::Cbq) + .paired(false) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = full_single_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_cbq_single_full_writer_minimal_record() -> Result<()> { + // Writer: single-end, with quality, headers, flags + // Record: single-end, no quality, no headers, no flags + // Expected: error (under-specified) + let mut writer = BinseqWriterBuilder::new(Format::Cbq) + .paired(false) + .quality(true) + .headers(true) + .flags(true) + .build(Cursor::new(Vec::new()))?; + + let record = minimal_single_record(); + let result = writer.push(record); + assert!(result.is_err()); + Ok(()) + } + + #[test] + fn test_cbq_single_full_writer_full_record() -> Result<()> { + // Writer: single-end, with quality, headers, flags + // Record: single-end, with quality, headers, flags + // Expected: success (correctly specified) + let mut writer = BinseqWriterBuilder::new(Format::Cbq) + .paired(false) + .quality(true) + .headers(true) + .flags(true) + .build(Cursor::new(Vec::new()))?; + + let record = full_single_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_cbq_paired_writer_single_record() -> Result<()> { + // Writer: paired + // Record: single-end + // Expected: error (under-specified - missing R2) + let mut writer = BinseqWriterBuilder::new(Format::Cbq) + .paired(true) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = minimal_single_record(); + let result = writer.push(record); + assert!(result.is_err()); + Ok(()) + } + + #[test] + fn test_cbq_single_writer_paired_record() -> Result<()> { + // Writer: single-end + // Record: paired + // Expected: success (over-specified - R2 ignored) + let mut writer = BinseqWriterBuilder::new(Format::Cbq) + .paired(false) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = minimal_paired_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_cbq_paired_minimal_writer_paired_full_record() -> Result<()> { + // Writer: paired, no quality, no headers, no flags + // Record: paired, with quality, headers, flags + // Expected: success (over-specified) + let mut writer = BinseqWriterBuilder::new(Format::Cbq) + .paired(true) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = full_paired_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_cbq_paired_full_writer_paired_full_record() -> Result<()> { + // Writer: paired, with quality, headers, flags + // Record: paired, with quality, headers, flags + // Expected: success (correctly specified) + let mut writer = BinseqWriterBuilder::new(Format::Cbq) + .paired(true) + .quality(true) + .headers(true) + .flags(true) + .build(Cursor::new(Vec::new()))?; + + let record = full_paired_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + // ==================== BQ Tests ==================== + // Note: BQ format has fixed-length sequences and doesn't support headers + + #[test] + fn test_bq_single_minimal_writer_minimal_record() -> Result<()> { + // Writer: single-end, no quality, no flags + // Record: single-end, no quality, no flags + // Expected: success (correctly specified) + let mut writer = BinseqWriterBuilder::new(Format::Bq) + .slen(32) + .paired(false) + .quality(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = minimal_single_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_bq_single_minimal_writer_full_record() -> Result<()> { + // Writer: single-end, no quality, no flags + // Record: single-end, with quality, headers, flags + // Expected: success (over-specified - extra data ignored) + let mut writer = BinseqWriterBuilder::new(Format::Bq) + .slen(32) + .paired(false) + .quality(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = full_single_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_bq_single_with_quality_writer_minimal_record() -> Result<()> { + // Writer: single-end, with quality (note: BQ ignores quality setting) + // Record: single-end, no quality + // Expected: success (BQ format doesn't support quality scores, setting is ignored) + let mut writer = BinseqWriterBuilder::new(Format::Bq) + .slen(32) + .paired(false) + .quality(true) // This is ignored for BQ format + .build(Cursor::new(Vec::new()))?; + + // BQ always reports has_quality as false + assert!(!writer.has_quality()); + + let record = minimal_single_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_bq_single_with_quality_writer_full_record() -> Result<()> { + // Writer: single-end, with quality + // Record: single-end, with quality + // Expected: success (correctly specified) + let mut writer = BinseqWriterBuilder::new(Format::Bq) + .slen(32) + .paired(false) + .quality(true) + .build(Cursor::new(Vec::new()))?; + + let record = full_single_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_bq_paired_writer_single_record() -> Result<()> { + // Writer: paired + // Record: single-end + // Expected: error (under-specified - missing R2) + let mut writer = BinseqWriterBuilder::new(Format::Bq) + .slen(32) + .xlen(32) + .paired(true) + .quality(false) + .build(Cursor::new(Vec::new()))?; + + let record = minimal_single_record(); + let result = writer.push(record); + assert!(result.is_err()); + Ok(()) + } + + #[test] + fn test_bq_single_writer_paired_record() -> Result<()> { + // Writer: single-end + // Record: paired + // Expected: success (over-specified - R2 ignored) + let mut writer = BinseqWriterBuilder::new(Format::Bq) + .slen(32) + .paired(false) + .quality(false) + .build(Cursor::new(Vec::new()))?; + + let record = minimal_paired_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_bq_paired_minimal_writer_paired_full_record() -> Result<()> { + // Writer: paired, no quality, no flags + // Record: paired, with quality, headers, flags + // Expected: success (over-specified) + let mut writer = BinseqWriterBuilder::new(Format::Bq) + .slen(32) + .xlen(32) + .paired(true) + .quality(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = full_paired_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_bq_paired_full_writer_paired_full_record() -> Result<()> { + // Writer: paired, with quality, flags + // Record: paired, with quality, headers, flags + // Expected: success (correctly specified, headers ignored for BQ) + let mut writer = BinseqWriterBuilder::new(Format::Bq) + .slen(32) + .xlen(32) + .paired(true) + .quality(true) + .flags(true) + .build(Cursor::new(Vec::new()))?; + + let record = full_paired_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + // ==================== Configured Size Calculation Tests ==================== + + #[test] + fn test_configured_size_cbq_single_minimal() { + let record = minimal_single_record(); + // 32 nucleotides = 1 u64 word = 8 bytes + let size = record.configured_size_cbq(false, false, false, false); + assert_eq!(size, 8); + } + + #[test] + fn test_configured_size_cbq_single_with_flags() { + let record = full_single_record(); + // 32 nucleotides = 8 bytes + 8 bytes flag + let size = record.configured_size_cbq(false, true, false, false); + assert_eq!(size, 16); + } + + #[test] + fn test_configured_size_cbq_single_with_all() { + let record = full_single_record(); + // 32 nucleotides = 8 bytes + // + 8 bytes flag + // + 5 bytes header ("read1") + // + 32 bytes quality + let size = record.configured_size_cbq(false, true, true, true); + assert_eq!(size, 8 + 8 + 5 + 32); + } + + #[test] + fn test_configured_size_cbq_paired_minimal() { + let record = full_paired_record(); + // s_seq: 32 nucleotides = 8 bytes + // x_seq: 32 nucleotides = 8 bytes + let size = record.configured_size_cbq(true, false, false, false); + assert_eq!(size, 16); + } + + #[test] + fn test_configured_size_cbq_paired_with_all() { + let record = full_paired_record(); + // s_seq: 32 nucleotides = 8 bytes + // x_seq: 32 nucleotides = 8 bytes + // flag: 8 bytes + // s_header: 5 bytes ("read1") + // x_header: 5 bytes ("read2") + // s_qual: 32 bytes + // x_qual: 32 bytes + let size = record.configured_size_cbq(true, true, true, true); + assert_eq!(size, 8 + 8 + 8 + 5 + 5 + 32 + 32); + } + + #[test] + fn test_configured_size_cbq_paired_record_single_writer() { + // A paired record being written to a single-end writer + // should only count R1 data + let record = full_paired_record(); + let size = record.configured_size_cbq(false, true, true, true); + // Only s_seq (8) + flag (8) + s_header (5) + s_qual (32) + assert_eq!(size, 8 + 8 + 5 + 32); + } + + #[test] + fn test_configured_size_vbq_single_minimal() { + use bitnuc::BitSize; + let record = minimal_single_record(); + // s_len (8) + x_len (8) + s_seq (32 nucs = 1 word = 8 bytes) + let size = record.configured_size_vbq(false, false, false, false, BitSize::Two); + assert_eq!(size, 16 + 8); + } + + #[test] + fn test_configured_size_vbq_single_with_flags() { + use bitnuc::BitSize; + let record = full_single_record(); + // s_len (8) + x_len (8) + flag (8) + s_seq (8) + let size = record.configured_size_vbq(false, true, false, false, BitSize::Two); + assert_eq!(size, 16 + 8 + 8); + } + + #[test] + fn test_configured_size_vbq_single_with_all() { + use bitnuc::BitSize; + let record = full_single_record(); + // s_len (8) + x_len (8) + flag (8) + s_seq (8) + s_qual (32) + s_header_len (8) + s_header (5) + let size = record.configured_size_vbq(false, true, true, true, BitSize::Two); + assert_eq!(size, 16 + 8 + 8 + 32 + 8 + 5); + } + + #[test] + fn test_configured_size_vbq_paired_minimal() { + use bitnuc::BitSize; + let record = full_paired_record(); + // s_len (8) + x_len (8) + s_seq (8) + x_seq (8) + let size = record.configured_size_vbq(true, false, false, false, BitSize::Two); + assert_eq!(size, 16 + 8 + 8); + } + + #[test] + fn test_configured_size_vbq_paired_with_all() { + use bitnuc::BitSize; + let record = full_paired_record(); + // s_len (8) + x_len (8) + flag (8) + s_seq (8) + x_seq (8) + // + s_qual (32) + x_qual (32) + // + s_header_len (8) + s_header (5) + x_header_len (8) + x_header (5) + let size = record.configured_size_vbq(true, true, true, true, BitSize::Two); + assert_eq!(size, 16 + 8 + 8 + 8 + 32 + 32 + 8 + 5 + 8 + 5); + } + + #[test] + fn test_configured_size_vbq_paired_record_single_writer() { + use bitnuc::BitSize; + // A paired record being written to a single-end writer + // should only count R1 data + let record = full_paired_record(); + let size = record.configured_size_vbq(false, true, true, true, BitSize::Two); + // s_len (8) + x_len (8) + flag (8) + s_seq (8) + s_qual (32) + s_header_len (8) + s_header (5) + assert_eq!(size, 16 + 8 + 8 + 32 + 8 + 5); + } + + #[test] + fn test_configured_size_vbq_four_bit_encoding() { + use bitnuc::BitSize; + let record = minimal_single_record(); + // With 4-bit encoding: 2 nucleotides per byte, 16 per word + // 32 nucleotides = 2 words = 16 bytes + // s_len (8) + x_len (8) + s_seq (16) + let size = record.configured_size_vbq(false, false, false, false, BitSize::Four); + assert_eq!(size, 16 + 16); + } + + // ==================== Multiple Records Tests ==================== + + #[test] + fn test_vbq_multiple_records_mixed_specification() -> Result<()> { + // Writer configured minimally, records over-specified + let mut writer = BinseqWriterBuilder::new(Format::Vbq) + .paired(false) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + // Push minimal record + assert!(writer.push(minimal_single_record())?); + // Push full record (over-specified, should work) + assert!(writer.push(full_single_record())?); + // Push paired record (over-specified, R2 ignored) + assert!(writer.push(full_paired_record())?); + + writer.finish()?; + Ok(()) + } + + #[test] + fn test_cbq_multiple_records_mixed_specification() -> Result<()> { + // Writer configured minimally, records over-specified + let mut writer = BinseqWriterBuilder::new(Format::Cbq) + .paired(false) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + // Push minimal record + assert!(writer.push(minimal_single_record())?); + // Push full record (over-specified, should work) + assert!(writer.push(full_single_record())?); + // Push paired record (over-specified, R2 ignored) + assert!(writer.push(full_paired_record())?); + + writer.finish()?; + Ok(()) + } + + #[test] + fn test_bq_multiple_records_mixed_specification() -> Result<()> { + // Writer configured minimally, records over-specified + let mut writer = BinseqWriterBuilder::new(Format::Bq) + .slen(32) + .paired(false) + .quality(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + // Push minimal record + assert!(writer.push(minimal_single_record())?); + // Push full record (over-specified, should work) + assert!(writer.push(full_single_record())?); + // Push paired record (over-specified, R2 ignored) + assert!(writer.push(full_paired_record())?); + + writer.finish()?; + Ok(()) + } }