Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ categories = ["science::bioinformatics", "encoding", "data-structures"]
keywords = ["bioinformatics", "nucleotide", "sequencing", "genomics", "fastq"]

[dependencies]
anyhow = "1.0.100"
anyhow = {version = "1.0.100", optional = true}
auto_impl = "1.3.0"
bitnuc = "0.4.0"
bytemuck = { version = "1.24.0", features = ["derive", "extern_crate_alloc"] }
Expand All @@ -20,17 +20,24 @@ itoa = "1.0.17"
memchr = "2.7.6"
memmap2 = "0.9.9"
num_cpus = "1.17.0"
paraseq = { version = "0.4.8", optional = true }
parking_lot = {version = "0.12.5", optional = true }
rand = { version = "0.9.2", features = ["small_rng"] }
sucds = "0.8.3"
thiserror = "2.0.17"
zstd = { version = "0.13.3", features = ["zstdmt"] }

[dev-dependencies]
parking_lot = "0.12.5"
anyhow = "1.0.100"
parking_lot = "0.12.5"
clap = { version = "4.5.54", features = ["derive"] }
paraseq = "0.4.8"

[features]
default = ["paraseq", "anyhow"]
anyhow = ["dep:anyhow"]
paraseq = ["dep:paraseq", "dep:parking_lot"]

[lints.clippy]
pedantic = { level = "warn", priority = -1 }
cast_possible_truncation = "allow"
Expand Down
122 changes: 122 additions & 0 deletions examples/auto-write.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
use std::{fs::File, io::BufWriter};

use anyhow::Result;
use binseq::{BinseqWriterBuilder, write::Format};
use bitnuc::BitSize;
use clap::Parser;

type BoxedWriter = Box<dyn std::io::Write + Send>;

#[derive(Parser)]
struct Args {
/// Input FASTX to encode into BINSEQ format
#[clap(required = true)]
input: String,

/// Input FASTX to encode into BINSEQ format (R2)
#[clap(required = false)]
input2: Option<String>,

/// Output file path for BINSEQ format
#[clap(short = 'o', long)]
output: Option<String>,

/// Default prefix for writing BINSEQ: `<prefix>.<ext>`
#[clap(short = 'p', long, default_value = "output")]
prefix: String,

/// Format of the output BINSEQ file
///
/// [bq: bq|BQ|b, vbq: vbq|VBQ|v, cbq: cbq|CBQ|c]
#[clap(short = 'f', long)]
format: Option<Format>,

/// Exclude quality information in BINSEQ output
///
/// (bq ignores quality always)
#[clap(short = 'Q', long)]
exclude_quality: bool,

/// Exclude sequence headers in BINSEQ output
///
/// (bq ignores headers always)
#[clap(short = 'H', long)]
exclude_headers: bool,

/// Compression level for BINSEQ output (0: auto)
#[clap(long, default_value_t = 0)]
compression_level: i32,

/// Default BITSIZE for BINSEQ output (2: 2bit, 4: 4bit)
#[clap(long, default_value_t = 2)]
bitsize: u8,

/// Default BLOCKSIZE in KB for BINSEQ output (vbq,cbq)
#[clap(long, default_value_t = 128)]
blocksize: usize,

/// Number of threads to use for parallel processing, 0: all available
#[clap(short = 'T', long, default_value = "0")]
threads: usize,
}
impl Args {
/// Determines the output format based on the file extension or the provided format
fn format(&self) -> Format {
if let Some(format) = self.format {
format
} else {
if let Some(output) = &self.output {
match output.split(".").last() {
Some("bq") => Format::Bq,
Some("vbq") => Format::Vbq,
Some("cbq") => Format::Cbq,
_ => Format::default(),
}
} else {
Format::default()
}
}
}
fn bitsize(&self) -> BitSize {
match self.bitsize {
4 => BitSize::Four,
_ => BitSize::Two,
}
}

/// Creates an output file handle
fn ohandle(&self) -> Result<BoxedWriter> {
let path = if let Some(output) = &self.output {
output.to_string()
} else {
format!("{}{}", &self.prefix, self.format().extension())
};
let ofile = File::create(path).map(BufWriter::new)?;
Ok(Box::new(ofile))
}

fn is_paired(&self) -> bool {
self.input2.is_some()
}
}

fn main() -> Result<()> {
let args = Args::parse();
let handle = args.ohandle()?;
let builder = BinseqWriterBuilder::new(args.format())
.bitsize(args.bitsize())
.block_size(args.blocksize * 1024)
.headers(!args.exclude_headers)
.quality(!args.exclude_quality)
.compression_level(args.compression_level)
.encode_fastx(handle);
if args.is_paired() {
builder.input_paired(&args.input, args.input2.as_ref().unwrap())
} else {
builder.input(&args.input)
}
.threads(args.threads)
.run()?;

Ok(())
}
6 changes: 4 additions & 2 deletions src/bq/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -475,7 +475,9 @@ impl<W: Write> BinseqWriter<W> {
write_flag(&mut self.inner, record.flag().unwrap_or(0))?;
}

if record.is_paired() != self.encoder.header.is_paired() {
// Check paired status - writer can require paired (record must have R2),
// but if writer is single-end, we simply ignore any R2 data in the record.
if self.encoder.header.is_paired() && !record.is_paired() {
return Err(WriteError::ConfigurationMismatch {
attribute: "paired",
expected: self.encoder.header.is_paired(),
Expand All @@ -484,7 +486,7 @@ impl<W: Write> BinseqWriter<W> {
.into());
}

if record.is_paired() {
if self.encoder.header.is_paired() {
if let Some((sbuffer, xbuffer)) = self
.encoder
.encode_paired(record.s_seq, record.x_seq.unwrap_or_default())?
Expand Down
125 changes: 97 additions & 28 deletions src/cbq/core/block.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,31 +160,76 @@ impl ColumnarBlock {
self.nuclen = self.seq.len();
}

fn add_flag(&mut self, record: &SequencingRecord) {
if let Some(flag) = record.flag {
fn add_flag(&mut self, record: &SequencingRecord) -> Result<()> {
if self.header.has_flags() {
let Some(flag) = record.flag else {
return Err(WriteError::ConfigurationMismatch {
attribute: "flag",
expected: true,
actual: false,
}
.into());
};
self.flags.push(flag);
}
Ok(())
}

fn add_headers(&mut self, record: &SequencingRecord) {
if let Some(header) = record.s_header {
self.l_headers.push(header.len() as u64);
self.headers.extend_from_slice(header);
}
if let Some(header) = record.x_header {
self.l_headers.push(header.len() as u64);
self.headers.extend_from_slice(header);
fn add_headers(&mut self, record: &SequencingRecord) -> Result<()> {
if self.header.has_headers() {
let Some(sheader) = record.s_header else {
return Err(WriteError::ConfigurationMismatch {
attribute: "s_header",
expected: true,
actual: false,
}
.into());
};
self.l_headers.push(sheader.len() as u64);
self.headers.extend_from_slice(sheader);

if self.header.is_paired() {
let Some(xheader) = record.x_header else {
return Err(WriteError::ConfigurationMismatch {
attribute: "x_header",
expected: true,
actual: false,
}
.into());
};
self.l_headers.push(xheader.len() as u64);
self.headers.extend_from_slice(xheader);
}
}
Ok(())
}

/// Note: this does not check if quality scores are different lengths from sequence
fn add_quality(&mut self, record: &SequencingRecord) {
if let Some(qual) = record.s_qual {
self.qual.extend_from_slice(qual);
}
if let Some(qual) = record.x_qual {
self.qual.extend_from_slice(qual);
fn add_quality(&mut self, record: &SequencingRecord) -> Result<()> {
if self.header.has_qualities() {
let Some(squal) = record.s_qual() else {
return Err(WriteError::ConfigurationMismatch {
attribute: "s_qual",
expected: true,
actual: false,
}
.into());
};
self.qual.extend_from_slice(squal);

if self.header.is_paired() {
let Some(xqual) = record.x_qual() else {
return Err(WriteError::ConfigurationMismatch {
attribute: "x_qual",
expected: true,
actual: false,
}
.into());
};
self.qual.extend_from_slice(xqual);
}
}
Ok(())
}

/// Calculate the usage of the block as a percentage
Expand All @@ -194,7 +239,13 @@ impl ColumnarBlock {
}

pub(crate) fn can_fit(&self, record: &SequencingRecord<'_>) -> bool {
self.current_size + record.size() <= self.header.block_size as usize
let configured_size = record.configured_size_cbq(
self.header.is_paired(),
self.header.has_flags(),
self.header.has_headers(),
self.header.has_qualities(),
);
self.current_size + configured_size <= self.header.block_size as usize
}

pub(crate) fn can_ingest(&self, other: &Self) -> bool {
Expand All @@ -203,23 +254,32 @@ impl ColumnarBlock {

/// Ensure that the record can be pushed into the block
fn validate_record(&self, record: &SequencingRecord) -> Result<()> {
let configured_size = record.configured_size_cbq(
self.header.is_paired(),
self.header.has_flags(),
self.header.has_headers(),
self.header.has_qualities(),
);

if !self.can_fit(record) {
if record.size() > self.header.block_size as usize {
if configured_size > self.header.block_size as usize {
return Err(WriteError::RecordSizeExceedsMaximumBlockSize(
record.size(),
configured_size,
self.header.block_size as usize,
)
.into());
}
return Err(CbqError::BlockFull {
current_size: self.current_size,
record_size: record.size(),
record_size: configured_size,
block_size: self.header.block_size as usize,
}
.into());
}

if record.is_paired() != self.header.is_paired() {
// Check paired status - writer can require paired (record must have R2),
// but if writer is single-end, we simply ignore any R2 data in the record.
if self.header.is_paired() && !record.is_paired() {
return Err(WriteError::ConfigurationMismatch {
attribute: "paired",
expected: self.header.is_paired(),
Expand All @@ -228,7 +288,9 @@ impl ColumnarBlock {
.into());
}

if record.has_flags() != self.header.has_flags() {
// For flags, headers, and qualities: the writer can require them (record must have them),
// but if the writer doesn't need them, we simply ignore any extra data in the record.
if self.header.has_flags() && !record.has_flags() {
return Err(WriteError::ConfigurationMismatch {
attribute: "flags",
expected: self.header.has_flags(),
Expand All @@ -237,7 +299,7 @@ impl ColumnarBlock {
.into());
}

if record.has_headers() != self.header.has_headers() {
if self.header.has_headers() && !record.has_headers() {
return Err(WriteError::ConfigurationMismatch {
attribute: "headers",
expected: self.header.has_headers(),
Expand All @@ -246,7 +308,7 @@ impl ColumnarBlock {
.into());
}

if record.has_qualities() != self.header.has_qualities() {
if self.header.has_qualities() && !record.has_qualities() {
return Err(WriteError::ConfigurationMismatch {
attribute: "qualities",
expected: self.header.has_qualities(),
Expand All @@ -260,11 +322,18 @@ impl ColumnarBlock {
pub fn push(&mut self, record: SequencingRecord) -> Result<()> {
self.validate_record(&record)?;

let configured_size = record.configured_size_cbq(
self.header.is_paired(),
self.header.has_flags(),
self.header.has_headers(),
self.header.has_qualities(),
);

self.add_sequence(&record);
self.add_flag(&record);
self.add_headers(&record);
self.add_quality(&record);
self.current_size += record.size();
self.add_flag(&record)?;
self.add_headers(&record)?;
self.add_quality(&record)?;
self.current_size += configured_size;
self.num_records += 1;

Ok(())
Expand Down
Loading