Skip to content

Commit 8ca92a5

Browse files
committed
docs: improve documentation on vbq
1 parent ae14b44 commit 8ca92a5

1 file changed

Lines changed: 83 additions & 55 deletions

File tree

src/vbq/mod.rs

Lines changed: 83 additions & 55 deletions
Original file line number | Diff line number | Diff line change
@@ -1,94 +1,122 @@
1-
//! # VBINSEQ
1+
//! # VBINSEQ Format
22
//!
3-
//! VBINSEQ is a high-performance binary file format for nucleotides.
3+
//! VBINSEQ is a high-performance binary format for variable-length nucleotide sequences
4+
//! that optimizes both storage efficiency and parallel processing capabilities.
45
//!
5-
//! It is a variant of the BINSEQ file format with support for _variable length records_ and _quality scores_.
6+
//! For more information on the format, please refer to our [preprint](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v1).
67
//!
78
//! ## Overview
89
//!
9-
//! VBINSEQ provides a block-based file format for efficient storage and retrieval of nucleotide sequences.
10-
//! Key features include:
10+
//! VBINSEQ extends the core principles of BINSEQ to accommodate:
1111
//!
12-
//! * **Block-based architecture** - Data is stored in fixed-size record blocks that can be processed independently
13-
//! * **Variable-length records** - Unlike fixed-size records, variable-length records can store sequences of any size
14-
//! * **Quality scores** - Optional quality score tracking for each nucleotide
15-
//! * **Paired sequences** - Support for paired-end sequencing data
16-
//! * **Parallel compression** - Support for ZSTD compression with parallel processing
17-
//! * **Random access** - Efficient random access to record blocks
12+
//! * **Variable-length sequences**: Unlike BINSEQ, which requires fixed-length reads, VBINSEQ can store
13+
//! sequences of any length, making it suitable for technologies like PacBio and Oxford Nanopore.
1814
//!
19-
//! ## Usage
15+
//! * **Quality scores**: Optional storage of quality scores alongside nucleotide data when needed.
2016
//!
21-
//! The two primary interfaces are:
17+
//! * **Block-based organization**: Data is organized into fixed-size independent record blocks
18+
//! for efficient parallel processing.
2219
//!
23-
//! * `VBinseqWriter` - For writing nucleotide sequences to a VBINSEQ file
24-
//! * `MmapReader` - For memory-mapped reading of VBINSEQ files
20+
//! * **Compression**: Optional ZSTD compression of individual blocks balances storage
21+
//! efficiency with processing speed.
2522
//!
26-
//! ### Writing to a VBINSEQ file
23+
//! * **Paired-end support**: Native support for paired sequences without needing multiple files.
2724
//!
28-
//! ```rust
25+
//! ## File Structure
26+
//!
27+
//! A VBINSEQ file consists of a 32-byte header followed by a series of record blocks.
28+
//! Each block has a 32-byte header and contains one or more variable-length records.
29+
//!
30+
//! ```text
31+
//! ┌───────────────────┐
32+
//! │    File Header    │ 32 bytes
33+
//! ├───────────────────┤
34+
//! │   Block Header    │ 32 bytes
35+
//! ├───────────────────┤
36+
//! │                   │
37+
//! │   Block Records   │ Variable size
38+
//! │                   │
39+
//! ├───────────────────┤
40+
//! │   Block Header    │ 32 bytes
41+
//! ├───────────────────┤
42+
//! │                   │
43+
//! │   Block Records   │ Variable size
44+
//! │                   │
45+
//! └───────────────────┘
46+
//! ```
47+
//!
48+
//! ## Record Format
49+
//!
50+
//! Each record contains:
51+
//!
52+
//! * Flag field (8 bytes)
53+
//! * Primary sequence length (8 bytes)
54+
//! * Extended sequence length (8 bytes)
55+
//! * Primary sequence data (2-bit encoded)
56+
//! * Primary quality scores (optional)
57+
//! * Extended sequence data (optional, for paired-end)
58+
//! * Extended quality scores (optional)
59+
//!
60+
//! ## Performance Characteristics
61+
//!
62+
//! VBINSEQ is designed for high-throughput parallel processing:
63+
//!
64+
//! * Independent blocks enable true parallel processing without synchronization
65+
//! * Memory-mapped access provides efficient I/O
66+
//! * 2-bit encoding reduces storage requirements
67+
//! * Optional ZSTD compression reduces file size with minimal performance impact
68+
//!
69+
//! ## Usage Example
70+
//!
71+
//! ```
2972
//! use std::fs::File;
3073
//! use std::io::BufWriter;
3174
//! use binseq::vbq::{VBinseqHeader, VBinseqWriterBuilder, MmapReader};
3275
//! use binseq::BinseqRecord;
3376
//!
34-
//! // Path to the output file
35-
//! let path_name = "some_example.vbq";
36-
//!
37-
//! // Create a header with quality scores and compression enabled
38-
//! let header = VBinseqHeader::new(true, true, false);
77+
//! /*
78+
//! WRITING
79+
//! */
3980
//!
40-
//! // Open a file for writing
41-
//! let handle = File::create(path_name).map(BufWriter::new).unwrap();
81+
//! // Create a header for unpaired sequences with quality scores and compression enabled
82+
//! let with_qual = true;
83+
//! let compressed = true;
84+
//! let paired = false;
85+
//! let header = VBinseqHeader::new(with_qual, compressed, paired);
4286
//!
43-
//! // Create a writer with the specified header
87+
//! // Create the output file and build a buffered writer that uses the header
88+
//! let file = File::create("example.vbq").unwrap();
4489
//! let mut writer = VBinseqWriterBuilder::default()
4590
//! .header(header)
46-
//! .build(handle)
91+
//! .build(BufWriter::new(file))
4792
//! .unwrap();
4893
//!
49-
//! // Write a nucleotide sequence with quality scores
94+
//! // Write a sequence with quality scores
5095
//! let sequence = b"ACGTACGT";
51-
//! let quality = b"!!!?!?!!";
96+
//! let quality = b"IIIIFFFF";
5297
//! writer.write_nucleotides_quality(0, sequence, quality).unwrap();
5398
//! writer.finish().unwrap();
5499
//!
55-
//! // Open a file for memory-mapped reading
56-
//! let mut reader = MmapReader::new(path_name).unwrap();
100+
//! /*
101+
//! READING
102+
//! */
103+
//!
104+
//! // Read the sequences back
105+
//! let mut reader = MmapReader::new("example.vbq").unwrap();
57106
//! let mut block = reader.new_block();
58107
//!
59108
//! // Process blocks one at a time
60109
//! let mut seq_buffer = Vec::new();
61110
//! while reader.read_block_into(&mut block).unwrap() {
62111
//! for record in block.iter() {
63-
//! // Decode the sequence
64112
//! record.decode_s(&mut seq_buffer).unwrap();
65-
//! println!("Sequence {}: {}", record.index(), std::str::from_utf8(&seq_buffer).unwrap());
66-
//!
67-
//! // Validate the sequence and quality scores
68-
//! assert_eq!(seq_buffer, sequence);
69-
//! assert_eq!(record.squal(), quality);
70-
//!
71-
//! seq_buffer.clear(); // Clear the buffer for the next sequence
113+
//! println!("Sequence: {}", std::str::from_utf8(&seq_buffer).unwrap());
114+
//! println!("Quality: {}", std::str::from_utf8(record.squal()).unwrap());
115+
//! seq_buffer.clear();
72116
//! }
73117
//! }
74-
//!
75-
//! // Delete the temporary file (for testing purposes)
76-
//! std::fs::remove_file(path_name).unwrap();
118+
//! # std::fs::remove_file("example.vbq").unwrap_or(());
77119
//! ```
78-
//!
79-
//! ## File Format Structure
80-
//!
81-
//! The VBINSEQ file format consists of:
82-
//!
83-
//! 1. A file header (32 bytes) containing format information
84-
//! 2. A series of record blocks, each containing:
85-
//! - Block header (32 bytes)
86-
//! - Block data (variable size, containing records)
87-
//! - Block padding (to maintain fixed virtual block size)
88-
//!
89-
//! Each record contains a preamble with metadata and data containing encoded sequences and quality scores.
90-
//!
91-
//! See the README.md for detailed format specifications.
92120
93121
pub mod header;
94122
pub mod index;

0 commit comments

Comments
 (0)