|
1 | | -//! # VBINSEQ |
| 1 | +//! # VBINSEQ Format |
2 | 2 | //! |
3 | | -//! VBINSEQ is a high-performance binary file format for nucleotides. |
| 3 | +//! VBINSEQ is a high-performance binary format for variable-length nucleotide sequences |
| 4 | +//! that is optimized for both storage efficiency and parallel processing.
4 | 5 | //! |
5 | | -//! It is a variant of the BINSEQ file format with support for _variable length records_ and _quality scores_. |
| 6 | +//! For more information on the format, please refer to our [preprint](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v1). |
6 | 7 | //! |
7 | 8 | //! ## Overview |
8 | 9 | //! |
9 | | -//! VBINSEQ provides a block-based file format for efficient storage and retrieval of nucleotide sequences. |
10 | | -//! Key features include: |
| 10 | +//! VBINSEQ extends the core principles of BINSEQ to accommodate: |
11 | 11 | //! |
12 | | -//! * **Block-based architecture** - Data is stored in fixed-size record blocks that can be processed independently |
13 | | -//! * **Variable-length records** - Unlike fixed-size records, variable-length records can store sequences of any size |
14 | | -//! * **Quality scores** - Optional quality score tracking for each nucleotide |
15 | | -//! * **Paired sequences** - Support for paired-end sequencing data |
16 | | -//! * **Parallel compression** - Support for ZSTD compression with parallel processing |
17 | | -//! * **Random access** - Efficient random access to record blocks |
| 12 | +//! * **Variable-length sequences**: Unlike BINSEQ, which requires fixed-length reads, VBINSEQ can store
| 13 | +//! sequences of any length, making it suitable for technologies like PacBio and Oxford Nanopore. |
18 | 14 | //! |
19 | | -//! ## Usage |
| 15 | +//! * **Quality scores**: Optional storage of quality scores alongside nucleotide data when needed. |
20 | 16 | //! |
21 | | -//! The two primary interfaces are: |
| 17 | +//! * **Block-based organization**: Data is organized into fixed-size independent record blocks |
| 18 | +//! for efficient parallel processing. |
22 | 19 | //! |
23 | | -//! * `VBinseqWriter` - For writing nucleotide sequences to a VBINSEQ file |
24 | | -//! * `MmapReader` - For memory-mapped reading of VBINSEQ files |
| 20 | +//! * **Compression**: Optional ZSTD compression of individual blocks balances storage |
| 21 | +//! efficiency with processing speed. |
25 | 22 | //! |
26 | | -//! ### Writing to a VBINSEQ file |
| 23 | +//! * **Paired-end support**: Native support for paired sequences without needing multiple files. |
27 | 24 | //! |
28 | | -//! ```rust |
| 25 | +//! ## File Structure |
| 26 | +//! |
| 27 | +//! A VBINSEQ file consists of a 32-byte header followed by a series of record blocks. |
| 28 | +//! Each block has a 32-byte header and contains one or more variable-length records. |
| 29 | +//! |
| 30 | +//! ```text |
| 31 | +//! ┌───────────────────┐ |
| 32 | +//! │ File Header │ 32 bytes |
| 33 | +//! ├───────────────────┤ |
| 34 | +//! │ Block Header │ 32 bytes |
| 35 | +//! ├───────────────────┤ |
| 36 | +//! │ │ |
| 37 | +//! │ Block Records │ Variable size |
| 38 | +//! │ │ |
| 39 | +//! ├───────────────────┤ |
| 40 | +//! │ Block Header │ 32 bytes |
| 41 | +//! ├───────────────────┤ |
| 42 | +//! │ │ |
| 43 | +//! │ Block Records │ Variable size |
| 44 | +//! │ │ |
| 45 | +//! └───────────────────┘ |
| 46 | +//! ``` |
| 47 | +//! |
| 48 | +//! ## Record Format |
| 49 | +//! |
| 50 | +//! Each record contains: |
| 51 | +//! |
| 52 | +//! * Flag field (8 bytes) |
| 53 | +//! * Primary sequence length (8 bytes) |
| 54 | +//! * Extended sequence length (8 bytes) |
| 55 | +//! * Primary sequence data (2-bit encoded) |
| 56 | +//! * Primary quality scores (optional) |
| 57 | +//! * Extended sequence data (optional, for paired-end) |
| 58 | +//! * Extended quality scores (optional) |
| 59 | +//! |
| 60 | +//! ## Performance Characteristics |
| 61 | +//! |
| 62 | +//! VBINSEQ is designed for high-throughput parallel processing: |
| 63 | +//! |
| 64 | +//! * Independent blocks enable true parallel processing without synchronization |
| 65 | +//! * Memory-mapped access provides efficient I/O |
| 66 | +//! * 2-bit encoding reduces storage requirements |
| 67 | +//! * Optional ZSTD compression reduces file size with minimal performance impact |
| 68 | +//! |
| 69 | +//! ## Usage Example |
| 70 | +//! |
| 71 | +//! ``` |
29 | 72 | //! use std::fs::File; |
30 | 73 | //! use std::io::BufWriter; |
31 | 74 | //! use binseq::vbq::{VBinseqHeader, VBinseqWriterBuilder, MmapReader}; |
32 | 75 | //! use binseq::BinseqRecord; |
33 | 76 | //! |
34 | | -//! // Path to the output file |
35 | | -//! let path_name = "some_example.vbq"; |
36 | | -//! |
37 | | -//! // Create a header with quality scores and compression enabled |
38 | | -//! let header = VBinseqHeader::new(true, true, false); |
| 77 | +//! /* |
| 78 | +//! WRITING |
| 79 | +//! */ |
39 | 80 | //! |
40 | | -//! // Open a file for writing |
41 | | -//! let handle = File::create(path_name).map(BufWriter::new).unwrap(); |
| 81 | +//! // Create a header: quality scores stored, compression enabled, single-end (unpaired)
| 82 | +//! let with_qual = true; |
| 83 | +//! let compressed = true; |
| 84 | +//! let paired = false; |
| 85 | +//! let header = VBinseqHeader::new(with_qual, compressed, paired); |
42 | 86 | //! |
43 | | -//! // Create a writer with the specified header |
| 87 | +//! // Create a buffered file writer configured with that header
| 88 | +//! let file = File::create("example.vbq").unwrap(); |
44 | 89 | //! let mut writer = VBinseqWriterBuilder::default() |
45 | 90 | //! .header(header) |
46 | | -//! .build(handle) |
| 91 | +//! .build(BufWriter::new(file)) |
47 | 92 | //! .unwrap(); |
48 | 93 | //! |
49 | | -//! // Write a nucleotide sequence with quality scores |
| 94 | +//! // Write a sequence with quality scores |
50 | 95 | //! let sequence = b"ACGTACGT"; |
51 | | -//! let quality = b"!!!?!?!!"; |
| 96 | +//! let quality = b"IIIIFFFF"; |
52 | 97 | //! writer.write_nucleotides_quality(0, sequence, quality).unwrap(); |
53 | 98 | //! writer.finish().unwrap(); |
54 | 99 | //! |
55 | | -//! // Open a file for memory-mapped reading |
56 | | -//! let mut reader = MmapReader::new(path_name).unwrap(); |
| 100 | +//! /* |
| 101 | +//! READING |
| 102 | +//! */ |
| 103 | +//! |
| 104 | +//! // Read the sequences back |
| 105 | +//! let mut reader = MmapReader::new("example.vbq").unwrap(); |
57 | 106 | //! let mut block = reader.new_block(); |
58 | 107 | //! |
59 | 108 | //! // Process blocks one at a time |
60 | 109 | //! let mut seq_buffer = Vec::new(); |
61 | 110 | //! while reader.read_block_into(&mut block).unwrap() { |
62 | 111 | //! for record in block.iter() { |
63 | | -//! // Decode the sequence |
64 | 112 | //! record.decode_s(&mut seq_buffer).unwrap(); |
65 | | -//! println!("Sequence {}: {}", record.index(), std::str::from_utf8(&seq_buffer).unwrap()); |
66 | | -//! |
67 | | -//! // Validate the sequence and quality scores |
68 | | -//! assert_eq!(seq_buffer, sequence); |
69 | | -//! assert_eq!(record.squal(), quality); |
70 | | -//! |
71 | | -//! seq_buffer.clear(); // Clear the buffer for the next sequence |
| 113 | +//! println!("Sequence: {}", std::str::from_utf8(&seq_buffer).unwrap()); |
| 114 | +//! println!("Quality: {}", std::str::from_utf8(record.squal()).unwrap()); |
| 115 | +//! seq_buffer.clear(); |
72 | 116 | //! } |
73 | 117 | //! } |
74 | | -//! |
75 | | -//! // Delete the temporary file (for testing purposes) |
76 | | -//! std::fs::remove_file(path_name).unwrap(); |
| 118 | +//! # std::fs::remove_file("example.vbq").unwrap_or(()); |
77 | 119 | //! ``` |
78 | | -//! |
79 | | -//! ## File Format Structure |
80 | | -//! |
81 | | -//! The VBINSEQ file format consists of: |
82 | | -//! |
83 | | -//! 1. A file header (32 bytes) containing format information |
84 | | -//! 2. A series of record blocks, each containing: |
85 | | -//! - Block header (32 bytes) |
86 | | -//! - Block data (variable size, containing records) |
87 | | -//! - Block padding (to maintain fixed virtual block size) |
88 | | -//! |
89 | | -//! Each record contains a preamble with metadata and data containing encoded sequences and quality scores. |
90 | | -//! |
91 | | -//! See the README.md for detailed format specifications. |
92 | 120 |
|
93 | 121 | pub mod header; |
94 | 122 | pub mod index; |
|
0 commit comments