|
| 1 | +//! # bq |
| 2 | +//! |
| 3 | +//! *.bq files are BINSEQ variants for **fixed-length** records and **do not support quality scores**. |
| 4 | +//! |
| 5 | +//! For variable-length records and optional quality scores use the [`vbq`](crate::vbq) module. |
| 6 | +//! |
| 7 | +//! This module contains the utilities for reading, writing, and interacting with BINSEQ files. |
| 8 | +//! |
| 9 | +//! For detailed information on the file format, see our [paper](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v1). |
| 10 | +//! |
| 11 | +//! ## Usage |
| 12 | +//! |
| 13 | +//! ### Reading |
| 14 | +//! ```rust |
| 15 | +//! use binseq::{bq, BinseqRecord}; |
| 16 | +//! use rand::{thread_rng, Rng}; |
| 17 | +//! |
| 18 | +//! let path = "./data/subset.bq"; |
| 19 | +//! let reader = bq::MmapReader::new(path).unwrap(); |
| 20 | +//! |
| 21 | +//! // We can easily determine the number of records in the file |
| 22 | +//! let num_records = reader.num_records(); |
| 23 | +//! |
| 24 | +//! // We have random access to any record within the range |
| 25 | +//! let random_index = thread_rng().gen_range(0..num_records); |
| 26 | +//! let record = reader.get(random_index).unwrap(); |
| 27 | +//! |
| 28 | +//! // We can easily decode the 2-bit encoded sequence back to a sequence of bytes |
| 29 | +//! let mut sbuf = Vec::new(); |
| 30 | +//! let mut xbuf = Vec::new(); |
| 31 | +//! |
| 32 | +//! record.decode_s(&mut sbuf); |
| 33 | +//! if record.is_paired() { |
| 34 | +//! record.decode_x(&mut xbuf); |
| 35 | +//! } |
| 36 | +//! ``` |
| 37 | +//! |
| 38 | +//! ### Writing |
| 39 | +//! |
| 40 | +//! #### Writing unpaired sequences |
| 41 | +//! |
| 42 | +//! ```rust |
| 43 | +//! use binseq::bq; |
| 44 | +//! use std::fs::File; |
| 45 | +//! |
| 46 | +//! // Define a path for the output file |
| 47 | +//! let path = "./data/some_output.bq"; |
| 48 | +//! |
| 49 | +//! // Create the file handle |
| 50 | +//! let output_handle = File::create(path).unwrap(); |
| 51 | +//! |
| 52 | +//! // Initialize our BINSEQ header (64 bp, only primary) |
| 53 | +//! let header = bq::BinseqHeader::new(64); |
| 54 | +//! |
| 55 | +//! // Initialize our BINSEQ writer |
| 56 | +//! let mut writer = bq::BinseqWriterBuilder::default() |
| 57 | +//! .header(header) |
| 58 | +//! .build(output_handle) |
| 59 | +//! .unwrap(); |
| 60 | +//! |
| 61 | +//! // Define the sequence to write (all `A`s for simplicity) |
| 62 | +//! let seq = [b'A'; 64]; |
| 63 | +//! let flag = 0; |
| 64 | +//! |
| 65 | +//! // Write the sequence to the file |
| 66 | +//! writer.write_nucleotides(flag, &seq).unwrap(); |
| 67 | +//! |
| 68 | +//! // Flush the writer |
| 69 | +//! writer.flush().unwrap(); |
| 70 | +//! |
| 71 | +//! // Remove the file created |
| 72 | +//! std::fs::remove_file(path).unwrap(); |
| 73 | +//! ``` |
| 74 | +//! |
| 75 | +//! #### Writing paired sequences |
| 76 | +//! |
| 77 | +//! ```rust |
| 78 | +//! use binseq::bq; |
| 79 | +//! use std::fs::File; |
| 80 | +//! |
| 81 | +//! // Define a path for the output file |
| 82 | +//! let path = "./data/some_output.bq"; |
| 83 | +//! |
| 84 | +//! // Create the file handle |
| 85 | +//! let output_handle = File::create(path).unwrap(); |
| 86 | +//! |
| 87 | +//! // Initialize our BINSEQ header (64 bp primary, 128 bp secondary) |
| 88 | +//! let header = bq::BinseqHeader::new_extended(64, 128); |
| 89 | +//! |
| 90 | +//! // Initialize our BINSEQ writer |
| 91 | +//! let mut writer = bq::BinseqWriterBuilder::default() |
| 92 | +//! .header(header) |
| 93 | +//! .build(output_handle) |
| 94 | +//! .unwrap(); |
| 95 | +//! |
| 96 | +//! // Define the primary and secondary sequences to write |
| 97 | +//! let primary = [b'A'; 64]; |
| 98 | +//! let secondary = [b'C'; 128]; |
| 99 | +//! let flag = 0; |
| 100 | +//! |
| 101 | +//! // Write the sequence to the file |
| 102 | +//! writer.write_paired(flag, &primary, &secondary).unwrap(); |
| 103 | +//! |
| 104 | +//! // Flush the writer |
| 105 | +//! writer.flush().unwrap(); |
| 106 | +//! |
| 107 | +//! // Remove the file created |
| 108 | +//! std::fs::remove_file(path).unwrap(); |
| 109 | +//! ``` |
| 110 | +//! |
| 111 | +//! ## BQ file format |
| 112 | +//! |
| 113 | +//! A BINSEQ file consists of two sections: |
| 114 | +//! |
| 115 | +//! 1. Fixed-size header (32 bytes) |
| 116 | +//! 2. Record data section |
| 117 | +//! |
| 118 | +//! ### Header Format (32 bytes total) |
| 119 | +//! |
| 120 | +//! | Offset | Size (bytes) | Name | Description | Type | |
| 121 | +//! | ------ | ------------ | -------- | ---------------------------- | ------ | |
| 122 | +//! | 0 | 4 | magic | Magic number (0x42534551) | uint32 | |
| 123 | +//! | 4 | 1 | format | Format version (currently 2) | uint8 | |
| 124 | +//! | 5 | 4 | slen | Sequence length (primary) | uint32 | |
| 125 | +//! | 9 | 4 | xlen | Sequence length (secondary) | uint32 | |
| 126 | +//! | 13 | 19 | reserved | Reserved for future use | bytes | |
| 127 | +//! |
| 128 | +//! ### Record Format |
| 129 | +//! |
| 130 | +//! Each record consists of: |
| 131 | +//! |
| 132 | +//! 1. Flag field (8 bytes, uint64) |
| 133 | +//! 2. Sequence data (ceil(N/32) \* 8 bytes, where N is sequence length) |
| 134 | +//! |
| 135 | +//! The flag field is implementation-defined and can be used for filtering, metadata, or other purposes. The placement of the flag field at the start of each record enables efficient filtering without reading sequence data. |
| 136 | +//! |
| 137 | +//! Total record size = 8 + (ceil(N/32) \* 8) bytes, where N is sequence length |
| 138 | +//! |
| 139 | +//! ## Encoding |
| 140 | +//! |
| 141 | +//! - Each nucleotide is encoded using 2 bits: |
| 142 | +//! - A = 00 |
| 143 | +//! - C = 01 |
| 144 | +//! - G = 10 |
| 145 | +//! - T = 11 |
| 146 | +//! - Non-ATCG characters are **unsupported**. |
| 147 | +//! - Sequences are stored in Little-Endian order |
| 148 | +//! - The final u64 of sequence data is padded with zeros if the sequence length is not divisible by 32 |
| 149 | +//! |
| 150 | +//! See [`bitnuc`] for 2bit implementation details. |
| 151 | +//! |
| 152 | +//! ## BQ Implementation Notes |
| 153 | +//! |
| 154 | +//! - Sequences are stored in u64 chunks, each holding up to 32 bases |
| 155 | +//! - Random access to any record can be calculated as: |
| 156 | +//! - record_size = 8 + (ceil(sequence_length/32) \* 8) |
| 157 | +//! - record_start = 32 + (record_index \* record_size) |
| 158 | +//! - Total number of records can be calculated as: (file_size - 32) / record_size |
| 159 | +//! - Flag field placement allows for efficient filtering strategies: |
| 160 | +//! - Records can be skipped based on flag values without reading sequence data |
| 161 | +//! - Flag checks can be vectorized for parallel processing |
| 162 | +//! - Memory access patterns are predictable for better cache utilization |
| 163 | +//! |
| 164 | +//! ## Example Storage Requirements |
| 165 | +//! |
| 166 | +//! Common sequence lengths: |
| 167 | +//! |
| 168 | +//! - 32bp reads: |
| 169 | +//! - Sequence: 1 \* 8 = 8 bytes (fits in one u64) |
| 170 | +//! - Flag: 8 bytes |
| 171 | +//! - Total per record: 16 bytes |
| 172 | +//! - 100bp reads: |
| 173 | +//! - Sequence: 4 \* 8 = 32 bytes (requires four u64s) |
| 174 | +//! - Flag: 8 bytes |
| 175 | +//! - Total per record: 40 bytes |
| 176 | +//! - 150bp reads: |
| 177 | +//! - Sequence: 5 \* 8 = 40 bytes (requires five u64s) |
| 178 | +//! - Flag: 8 bytes |
| 179 | +//! - Total per record: 48 bytes |
| 180 | +//! |
| 181 | +//! ## Validation |
| 182 | +//! |
| 183 | +//! Implementations should verify: |
| 184 | +//! |
| 185 | +//! 1. Correct magic number |
| 186 | +//! 2. Compatible version number |
| 187 | +//! 3. Sequence length is greater than 0 |
| 188 | +//! 4. File size minus header (32 bytes) is divisible by the record size |
| 189 | +//! |
| 190 | +//! ## Future Considerations |
| 191 | +//! |
| 192 | +//! - The 19 reserved bytes in the header allow for future format extensions |
| 193 | +//! - The 64-bit flag field provides space for implementation-specific features such as: |
| 194 | +//! - Quality score summaries |
| 195 | +//! - Filtering flags |
| 196 | +//! - Read group identifiers |
| 197 | +//! - Processing state |
| 198 | +//! - Count data |
| 199 | +
|
1 | 200 | mod header; |
2 | 201 | mod reader; |
3 | 202 | mod utils; |
|
0 commit comments