Skip to content

Commit d5efd7c

Browse files
authored
Merge pull request #34 from noamteyssier/33-improve-documentation-throughout-repo
33 improve documentation throughout repo
2 parents 4ecafb7 + d249d5c commit d5efd7c

9 files changed

Lines changed: 908 additions & 74 deletions

File tree

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "binseq"
3-
version = "0.5.3"
3+
version = "0.5.4"
44
edition = "2021"
55

66
[dependencies]

src/error.rs

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,47 +1,101 @@
1+
/// Custom Result type for binseq operations, wrapping the custom [`Error`] type
12
pub type Result<T> = std::result::Result<T, Error>;
23

4+
/// The main error type for the binseq library, encompassing all possible error cases
5+
/// that can occur during binary sequence operations.
36
#[derive(thiserror::Error, Debug)]
47
#[error(transparent)]
58
pub enum Error {
9+
/// Errors related to binary sequence header processing
610
HeaderError(#[from] HeaderError),
11+
/// Errors that occur during read operations
712
ReadError(#[from] ReadError),
13+
/// Errors that occur during write operations
814
WriteError(#[from] WriteError),
15+
/// Standard I/O errors from the Rust standard library
916
IoError(#[from] std::io::Error),
17+
/// UTF-8 encoding/decoding errors
1018
Utf8Error(#[from] std::str::Utf8Error),
19+
/// Errors from the bitnuc nucleotide processing library
1120
BitnucError(#[from] bitnuc::NucleotideError),
21+
/// Generic errors that can occur in any part of the system
1222
AnyhowError(#[from] anyhow::Error),
1323
}
1424

25+
/// Errors specific to processing and validating binary sequence headers
1526
#[derive(thiserror::Error, Debug)]
1627
pub enum HeaderError {
28+
/// The magic number in the header does not match the expected value
29+
///
30+
/// # Arguments
31+
/// * `u32` - The invalid magic number that was found
1732
#[error("Invalid magic number: {0}")]
1833
InvalidMagicNumber(u32),
34+
35+
/// The format version in the header is not supported
36+
///
37+
/// # Arguments
38+
/// * `u8` - The unsupported version number that was found
1939
#[error("Invalid format version: {0}")]
2040
InvalidFormatVersion(u8),
41+
42+
/// The reserved bytes in the header contain unexpected values
2143
#[error("Invalid reserved bytes")]
2244
InvalidReservedBytes,
45+
46+
/// The size of the data does not match what was specified in the header
47+
///
48+
/// # Arguments
49+
/// * First `usize` - The actual number of bytes provided
50+
/// * Second `usize` - The expected number of bytes according to the header
2351
#[error("Invalid number of bytes provided: {0}. Expected: {1}")]
2452
InvalidSize(usize, usize),
2553
}
2654

55+
/// Errors that can occur while reading binary sequence data
2756
#[derive(thiserror::Error, Debug)]
2857
pub enum ReadError {
58+
/// The file being read is not a regular file (e.g., it might be a directory or special file)
2959
#[error("File is not regular")]
3060
IncompatibleFile,
61+
62+
/// The file appears to be truncated or corrupted
63+
///
64+
/// # Arguments
65+
/// * `usize` - The byte position where the truncation was detected
3166
#[error(
3267
"Number of bytes in file does not match expectation - possibly truncated at byte pos {0}"
3368
)]
3469
FileTruncation(usize),
70+
71+
/// Attempted to access a record index that is beyond the available range
72+
///
73+
/// # Arguments
74+
/// * First `usize` - The requested record index
75+
/// * Second `usize` - The maximum available record index
3576
#[error("Requested record index ({0}) is out of record range ({1})")]
3677
OutOfRange(usize, usize),
3778
}
3879

80+
/// Errors that can occur while writing binary sequence data
3981
#[derive(thiserror::Error, Debug)]
4082
pub enum WriteError {
83+
/// The length of the sequence being written does not match what was specified in the header
84+
///
85+
/// # Fields
86+
/// * `expected` - The sequence length specified in the header
87+
/// * `got` - The actual length of the sequence being written
4188
#[error("Sequence length ({got}) does not match the header ({expected})")]
4289
UnexpectedSequenceLength { expected: u32, got: usize },
90+
91+
/// The sequence contains invalid nucleotide characters
92+
///
93+
/// # Arguments
94+
/// * `String` - Description of the invalid nucleotides found
4395
#[error("Invalid nucleotides found in sequence: {0}")]
4496
InvalidNucleotideSequence(String),
97+
98+
/// Attempted to write data without first setting up the header
4599
#[error("Missing header in writer builder")]
46100
MissingHeader,
47101
}

src/header.rs

Lines changed: 123 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,36 @@
1+
//! Header module for the binseq library
2+
//!
3+
//! This module provides the header structure and functionality for binary sequence files.
4+
//! The header contains metadata about the binary sequence data, including format version,
5+
//! sequence length, and other information necessary for proper interpretation of the data.
6+
17
use byteorder::{ByteOrder, LittleEndian};
28
use std::io::{Read, Write};
39

410
use crate::{error::Result, HeaderError};
511

6-
/// Current magic number: "BSEQ" in ASCII
12+
/// Current magic number: "BSEQ" in ASCII (in little-endian byte order)
13+
///
14+
/// This is used to identify binary sequence files and verify file integrity.
715
const MAGIC: u32 = 0x51455342;
816

9-
/// Current format version
17+
/// Current format version of the binary sequence file format
18+
///
19+
/// This version number allows for future format changes while maintaining backward compatibility.
1020
const FORMAT: u8 = 1;
1121

1222
/// Size of the header in bytes
23+
///
24+
/// The header has a fixed size to ensure consistent reading and writing of binary sequence files.
1325
pub const SIZE_HEADER: usize = 32;
1426

27+
/// Header structure for binary sequence files
28+
///
29+
/// The `BinseqHeader` contains metadata about the binary sequence data stored in a file,
30+
/// including format information, sequence lengths, and space for future extensions.
31+
///
32+
/// The total size of this structure is 32 bytes, with a fixed layout to ensure
33+
/// consistent reading and writing across different platforms.
1534
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
1635
pub struct BinseqHeader {
1736
/// Magic number to identify the file format
@@ -40,6 +59,19 @@ pub struct BinseqHeader {
4059
pub reserved: [u8; 19],
4160
}
4261
impl BinseqHeader {
62+
/// Creates a new header with the specified sequence length
63+
///
64+
/// This constructor initializes a standard header with the given sequence length,
65+
/// setting the magic number and format version to their default values.
66+
/// The extended sequence length (xlen) is set to 0.
67+
///
68+
/// # Arguments
69+
///
70+
/// * `slen` - The length of sequences in the file
71+
///
72+
/// # Returns
73+
///
74+
/// A new `BinseqHeader` instance
4375
pub fn new(slen: u32) -> Self {
4476
Self {
4577
magic: MAGIC,
@@ -50,6 +82,19 @@ impl BinseqHeader {
5082
}
5183
}
5284

85+
/// Creates a new header with both primary and extended sequence lengths
86+
///
87+
/// This constructor initializes a header for files that contain both primary
88+
/// and secondary sequence data, such as quality scores or annotations.
89+
///
90+
/// # Arguments
91+
///
92+
/// * `slen` - The length of primary sequences in the file
93+
/// * `xlen` - The length of secondary/extended sequences in the file
94+
///
95+
/// # Returns
96+
///
97+
/// A new `BinseqHeader` instance with extended sequence information
5398
pub fn new_extended(slen: u32, xlen: u32) -> Self {
5499
Self {
55100
magic: MAGIC,
@@ -60,6 +105,26 @@ impl BinseqHeader {
60105
}
61106
}
62107

108+
/// Parses a header from a fixed-size byte array
109+
///
110+
/// This method validates the magic number and format version before constructing
111+
/// a header instance. If validation fails, appropriate errors are returned.
112+
///
113+
/// # Arguments
114+
///
115+
/// * `buffer` - A byte array of exactly `SIZE_HEADER` bytes containing the header data
116+
///
117+
/// # Returns
118+
///
119+
/// * `Ok(BinseqHeader)` - A valid header parsed from the buffer
120+
/// * `Err(Error)` - If the buffer contains invalid header data
121+
///
122+
/// # Errors
123+
///
124+
/// Returns an error if:
125+
/// * The magic number is incorrect
126+
/// * The format version is unsupported
127+
/// * The reserved bytes are invalid
63128
pub fn from_bytes(buffer: &[u8; SIZE_HEADER]) -> Result<Self> {
64129
let magic = LittleEndian::read_u32(&buffer[0..4]);
65130
if magic != MAGIC {
@@ -84,7 +149,26 @@ impl BinseqHeader {
84149
})
85150
}
86151

87-
/// Parses an arbitrarily sized buffer
152+
/// Parses a header from an arbitrarily sized buffer
153+
///
154+
/// This method extracts the header from the beginning of a buffer that may be larger
155+
/// than the header size. It checks that the buffer is at least as large as the header
156+
/// before attempting to parse it.
157+
///
158+
/// # Arguments
159+
///
160+
/// * `buffer` - A byte slice containing at least `SIZE_HEADER` bytes
161+
///
162+
/// # Returns
163+
///
164+
/// * `Ok(BinseqHeader)` - A valid header parsed from the buffer
165+
/// * `Err(Error)` - If the buffer is too small or contains invalid header data
166+
///
167+
/// # Errors
168+
///
169+
/// Returns an error if:
170+
/// * The buffer is smaller than `SIZE_HEADER`
171+
/// * The header data is invalid (see `from_bytes` for validation details)
88172
pub fn from_buffer(buffer: &[u8]) -> Result<Self> {
89173
let mut bytes = [0u8; SIZE_HEADER];
90174
if buffer.len() < SIZE_HEADER {
@@ -94,6 +178,23 @@ impl BinseqHeader {
94178
Self::from_bytes(&bytes)
95179
}
96180

181+
/// Writes the header to a writer
182+
///
183+
/// This method serializes the header to its binary representation and writes it
184+
/// to the provided writer.
185+
///
186+
/// # Arguments
187+
///
188+
/// * `writer` - Any type that implements the `Write` trait
189+
///
190+
/// # Returns
191+
///
192+
/// * `Ok(())` - If the header was successfully written
193+
/// * `Err(Error)` - If writing to the writer failed
194+
///
195+
/// # Errors
196+
///
197+
/// Returns an error if writing to the writer fails (typically an I/O error).
97198
pub fn write_bytes<W: Write>(&self, writer: &mut W) -> Result<()> {
98199
let mut buffer = [0u8; SIZE_HEADER];
99200
LittleEndian::write_u32(&mut buffer[0..4], self.magic);
@@ -105,6 +206,25 @@ impl BinseqHeader {
105206
Ok(())
106207
}
107208

209+
/// Reads a header from a reader
210+
///
211+
/// This method reads exactly `SIZE_HEADER` bytes from the provided reader and
212+
/// parses them into a header structure.
213+
///
214+
/// # Arguments
215+
///
216+
/// * `reader` - Any type that implements the `Read` trait
217+
///
218+
/// # Returns
219+
///
220+
/// * `Ok(BinseqHeader)` - A valid header read from the reader
221+
/// * `Err(Error)` - If reading from the reader failed or the header data is invalid
222+
///
223+
/// # Errors
224+
///
225+
/// Returns an error if:
226+
/// * Reading from the reader fails (typically an I/O error)
227+
/// * The header data is invalid (see `from_bytes` for validation details)
108228
pub fn from_reader<R: Read>(reader: &mut R) -> Result<Self> {
109229
let mut buffer = [0u8; SIZE_HEADER];
110230
reader.read_exact(&mut buffer)?;

src/lib.rs

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,46 @@
1+
#![doc = include_str!("../README.md")]
2+
//!
3+
//! # Overview
4+
//!
5+
//! The `binseq` library provides efficient tools for working with binary-encoded
6+
//! nucleotide sequences. It offers:
7+
//!
8+
//! - Compact 2-bit encoding of nucleotide sequences
9+
//! - Memory-mapped file access for efficient reading
10+
//! - Parallel processing capabilities
11+
//! - Configurable policies for handling invalid nucleotides
12+
//! - Support for both single and paired-end sequences
13+
//!
14+
//! # Core Components
15+
//!
16+
//! - [`BinseqWriter`]: Writes sequences to binary format
17+
//! - [`MmapReader`]: Reads sequences using memory mapping
18+
//! - [`BinseqHeader`]: Defines file format and sequence lengths
19+
//! - [`Policy`]: Configures invalid nucleotide handling
20+
//! - [`ParallelProcessor`]: Enables parallel sequence processing
21+
//!
22+
//! # Example
23+
//!
24+
//! ```
25+
//! use binseq::{BinseqHeader, BinseqWriterBuilder, MmapReader, Policy, Result};
26+
//! use std::io::Cursor;
27+
//!
28+
//! fn main() -> Result<()> {
29+
//! // Create a writer for sequences of length 100
30+
//! let header = BinseqHeader::new(100);
31+
//!
32+
//! let mut writer = BinseqWriterBuilder::default()
33+
//! .header(header)
34+
//! .build(Cursor::new(Vec::new()))?;
35+
//!
36+
//! // Write a sequence
37+
//! let sequence = b"ACGT".repeat(25); // 100 nucleotides
38+
//! writer.write_nucleotides(0, &sequence)?;
39+
//!
40+
//! Ok(())
41+
//! }
42+
//! ```
43+
144
#![allow(clippy::module_inception)]
245

346
pub mod error;
@@ -14,7 +57,7 @@ pub use parallel::ParallelProcessor;
1457
pub use policy::{Policy, RNG_SEED};
1558
pub use reader::{MmapReader, RefRecord};
1659
pub use utils::expected_file_size;
17-
pub use writer::{BinseqWriter, BinseqWriterBuilder};
60+
pub use writer::{BinseqWriter, BinseqWriterBuilder, Encoder};
1861

1962
// #[cfg(test)]
2063
// mod testing {

0 commit comments

Comments
 (0)