Skip to content

Commit cdc5b7a

Browse files
authored
Merge pull request #87 from ArcInstitute/remove-vbq-external-index-code
Remove vbq external index code
2 parents 8564733 + 3c55918 commit cdc5b7a

6 files changed

Lines changed: 122 additions & 259 deletions

File tree

src/error.rs

Lines changed: 0 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -60,24 +60,6 @@ pub enum Error {
6060
#[error("Fastx encoding error: {0}")]
6161
FastxEncodingError(#[from] FastxEncodingError),
6262
}
63-
impl Error {
64-
/// Checks if the error is an index mismatch error
65-
///
66-
/// This is useful for determining if a file's index is out of sync with its content,
67-
/// which might require rebuilding the index.
68-
///
69-
/// # Returns
70-
///
71-
/// * `true` if the error is an `IndexError::ByteSizeMismatch`
72-
/// * `false` for all other error types
73-
#[must_use]
74-
pub fn is_index_mismatch(&self) -> bool {
75-
match self {
76-
Self::IndexError(err) => err.is_mismatch(),
77-
_ => false,
78-
}
79-
}
80-
}
8163

8264
/// Errors specific to processing and validating binary sequence headers
8365
#[derive(thiserror::Error, Debug)]
@@ -284,36 +266,10 @@ pub enum IndexError {
284266
#[error("Invalid magic number: {0}")]
285267
InvalidMagicNumber(u64),
286268

287-
/// When the index references a file that doesn't exist
288-
///
289-
/// The parameter is the missing file path
290-
#[error("Index missing upstream file path: {0}")]
291-
MissingUpstreamFile(String),
292-
293-
/// When the size of the file doesn't match what the index expects
294-
///
295-
/// The first parameter is the actual file size, the second is the expected size
296-
#[error("Mismatch in size between upstream size: {0} and expected index size {1}")]
297-
ByteSizeMismatch(u64, u64),
298-
299269
/// Invalid reserved bytes in the index header
300270
#[error("Invalid reserved bytes in index header")]
301271
InvalidReservedBytes,
302272
}
303-
impl IndexError {
304-
/// Checks if this error indicates a mismatch between the index and file
305-
///
306-
/// This is useful to determine if the index needs to be rebuilt.
307-
///
308-
/// # Returns
309-
///
310-
/// * `true` for `ByteSizeMismatch` errors
311-
/// * `true` for any other error type (this behavior is likely a bug and should be fixed)
312-
#[must_use]
313-
pub fn is_mismatch(&self) -> bool {
314-
matches!(self, Self::ByteSizeMismatch(_, _) | _) // Note: this appears to always return true regardless of error type
315-
}
316-
}
317273

318274
#[derive(thiserror::Error, Debug)]
319275
pub enum CbqError {
@@ -411,55 +367,6 @@ mod testing {
411367
assert!(matches!(binseq_error, Error::GenericError(_)));
412368
}
413369

414-
// ==================== Error::is_index_mismatch Tests ====================
415-
416-
#[test]
417-
fn test_is_index_mismatch_with_byte_size_mismatch() {
418-
let error = Error::IndexError(IndexError::ByteSizeMismatch(100, 200));
419-
assert!(error.is_index_mismatch());
420-
}
421-
422-
#[test]
423-
fn test_is_index_mismatch_with_invalid_magic() {
424-
let error = Error::IndexError(IndexError::InvalidMagicNumber(0x1234));
425-
// Note: The current implementation has a bug - it always returns true
426-
assert!(error.is_index_mismatch());
427-
}
428-
429-
#[test]
430-
fn test_is_index_mismatch_with_non_index_error() {
431-
let error = Error::WriteError(WriteError::MissingHeader);
432-
assert!(!error.is_index_mismatch());
433-
}
434-
435-
// ==================== IndexError Tests ====================
436-
437-
#[test]
438-
fn test_index_error_is_mismatch() {
439-
let error = IndexError::ByteSizeMismatch(100, 200);
440-
assert!(error.is_mismatch());
441-
}
442-
443-
#[test]
444-
fn test_index_error_invalid_magic() {
445-
let error = IndexError::InvalidMagicNumber(0x1234);
446-
// Note: Current implementation bug - always returns true
447-
assert!(error.is_mismatch());
448-
}
449-
450-
#[test]
451-
fn test_index_error_missing_upstream_file() {
452-
let error = IndexError::MissingUpstreamFile("test.vbq".to_string());
453-
assert!(error.is_mismatch());
454-
assert!(format!("{}", error).contains("test.vbq"));
455-
}
456-
457-
#[test]
458-
fn test_index_error_invalid_reserved_bytes() {
459-
let error = IndexError::InvalidReservedBytes;
460-
assert!(error.is_mismatch());
461-
}
462-
463370
// ==================== HeaderError Tests ====================
464371

465372
#[test]

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
//! The VBQ format has undergone significant improvements:
3131
//!
3232
//! - **Embedded Index**: VBQ files now contain their index data embedded at the end of the file,
33-
//! eliminating separate `.vqi` index files and improving portability.
33+
//! improving portability.
3434
//! - **Headers Support**: Optional sequence identifiers/headers can be stored with each record.
3535
//! - **Extended Capacity**: u64 indexing supports files with more than 4 billion records.
3636
//! - **Multi-bit Encoding**: Support for both 2-bit and 4-bit nucleotide encodings.

src/vbq/index.rs

Lines changed: 18 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,8 @@
44
//!
55
//! ## Format Changes (v0.7.0+)
66
//!
7-
//! **BREAKING CHANGE**: The VBQ index is now embedded at the end of VBQ files instead of
8-
//! being stored in separate `.vqi` files. This improves portability and eliminates the
9-
//! need to manage auxiliary files.
7+
//! **BREAKING CHANGE**: The VBQ index is now embedded at the end of VBQ files,
8+
//! improving portability and eliminating the need to manage auxiliary files.
109
//!
1110
//! ## Embedded Index Structure
1211
//!
@@ -29,13 +28,13 @@
2928
//!
3029
//! ## Key Changes from v0.6.x
3130
//!
32-
//! - Index moved from separate `.vqi` files into VBQ files
31+
//! - Index is now embedded in VBQ files
3332
//! - Cumulative record counts changed from `u32` to `u64`
3433
//! - Support for files with more than 4 billion records
3534
3635
use std::{
3736
fs::File,
38-
io::{BufReader, BufWriter, Cursor, Read, Write},
37+
io::{Cursor, Read, Write},
3938
path::Path,
4039
};
4140

@@ -374,9 +373,10 @@ impl IndexHeader {
374373
/// `IndexHeader` and a collection of `BlockRange` entries, one for each block in
375374
/// the file.
376375
///
377-
/// The index can be created by scanning a VBQ file or loaded from a previously
378-
/// created index file. Once loaded, it provides information about block locations,
379-
/// sizes, and record counts.
376+
/// The index is embedded at the end of VBQ files and can be loaded using
377+
/// `MmapReader::load_index()` or created by scanning a VBQ file using
378+
/// `BlockIndex::from_vbq()`. Once loaded, it provides information about block
379+
/// locations, sizes, and record counts.
380380
///
381381
/// # Examples
382382
///
@@ -388,10 +388,6 @@ impl IndexHeader {
388388
/// let vbq_path = Path::new("example.vbq");
389389
/// let index = BlockIndex::from_vbq(vbq_path).unwrap();
390390
///
391-
/// // Save the index for future use
392-
/// let index_path = Path::new("example.vbq.vqi");
393-
/// index.save_to_path(index_path).unwrap();
394-
///
395391
/// // Use the index with a reader for parallel processing
396392
/// let reader = MmapReader::new(vbq_path).unwrap();
397393
/// println!("File contains {} blocks", index.n_blocks());
@@ -430,54 +426,18 @@ impl BlockIndex {
430426
/// # Examples
431427
///
432428
/// ```rust,no_run
433-
/// use binseq::vbq::BlockIndex;
429+
/// use binseq::vbq::{BlockIndex, MmapReader};
434430
/// use std::path::Path;
435431
///
436-
/// let index = BlockIndex::from_path(Path::new("example.vbq.vqi")).unwrap();
432+
/// let reader = MmapReader::new(Path::new("example.vbq")).unwrap();
433+
/// let index = reader.load_index().unwrap();
437434
/// println!("The file contains {} blocks", index.n_blocks());
438435
/// ```
439436
#[must_use]
440437
pub fn n_blocks(&self) -> usize {
441438
self.ranges.len()
442439
}
443440

444-
/// Writes the collection of `BlockRange` to a file
445-
/// Saves the index to a file
446-
///
447-
/// This writes the index header and all block ranges to a file, which can be loaded
448-
/// later to avoid rescanning the VBQ file. The index is compressed to reduce
449-
/// storage space.
450-
///
451-
/// # Parameters
452-
///
453-
/// * `path` - The path where the index file should be saved
454-
///
455-
/// # Returns
456-
///
457-
/// * `Ok(())` - If the index was successfully saved
458-
/// * `Err(_)` - If an error occurred during saving
459-
///
460-
/// # Examples
461-
///
462-
/// ```rust,no_run
463-
/// use binseq::vbq::BlockIndex;
464-
/// use std::path::Path;
465-
///
466-
/// // Create an index from a VBQ file
467-
/// let index = BlockIndex::from_vbq(Path::new("example.vbq")).unwrap();
468-
///
469-
/// // Save it for future use
470-
/// index.save_to_path(Path::new("example.vbq.vqi")).unwrap();
471-
/// ```
472-
pub fn save_to_path<P: AsRef<Path>>(&self, path: P) -> Result<()> {
473-
let mut writer = File::create(path).map(BufWriter::new)?;
474-
self.header.write_bytes(&mut writer)?;
475-
let mut writer = Encoder::new(writer, 3)?.auto_finish();
476-
self.write_range(&mut writer)?;
477-
writer.flush()?;
478-
Ok(())
479-
}
480-
481441
/// Write the index to an output buffer
482442
pub fn write_bytes<W: Write>(&self, writer: &mut W) -> Result<()> {
483443
self.header.write_bytes(writer)?;
@@ -490,9 +450,8 @@ impl BlockIndex {
490450
/// Write the collection of `BlockRange` to an output handle
491451
/// Writes all block ranges to the provided writer
492452
///
493-
/// This method is used internally by `save_to_path` to write the block ranges
494-
/// to an index file. It can also be used to serialize an index to any destination
495-
/// that implements `Write`.
453+
/// This method is used internally to write the block ranges to the embedded index.
454+
/// It can also be used to serialize an index to any destination that implements `Write`.
496455
///
497456
/// # Parameters
498457
///
@@ -524,8 +483,8 @@ impl BlockIndex {
524483
/// Creates a new index by scanning a VBQ file
525484
///
526485
/// This method memory-maps the specified VBQ file and scans it block by block
527-
/// to create an index. The index can then be saved to a file for future use, enabling
528-
/// efficient random access without rescanning the file.
486+
/// to create an index. This is primarily used internally when embedding the index
487+
/// into VBQ files during the write process.
529488
///
530489
/// # Parameters
531490
///
@@ -545,9 +504,6 @@ impl BlockIndex {
545504
/// // Create an index from a VBQ file
546505
/// let index = BlockIndex::from_vbq(Path::new("example.vbq")).unwrap();
547506
///
548-
/// // Save the index for future use
549-
/// index.save_to_path(Path::new("example.vbq.vqi")).unwrap();
550-
///
551507
/// // Get statistics about the file
552508
/// println!("File contains {} blocks", index.n_blocks());
553509
///
@@ -603,45 +559,6 @@ impl BlockIndex {
603559
Ok(index)
604560
}
605561

606-
/// Reads an index from a path
607-
///
608-
/// # Panics
609-
/// Panics if the path is not a valid UTF-8 string.
610-
pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
611-
let Some(upstream_file) = path.as_ref().to_str().unwrap().strip_suffix(".vqi") else {
612-
return Err(IndexError::MissingUpstreamFile(
613-
path.as_ref().to_string_lossy().to_string(),
614-
)
615-
.into());
616-
};
617-
let upstream_handle = File::open(upstream_file)?;
618-
let mmap = unsafe { memmap2::Mmap::map(&upstream_handle)? };
619-
let file_size = mmap.len() as u64;
620-
621-
let mut file_handle = File::open(path).map(BufReader::new)?;
622-
let index_header = IndexHeader::from_reader(&mut file_handle)?;
623-
if index_header.bytes != file_size {
624-
return Err(IndexError::ByteSizeMismatch(file_size, index_header.bytes).into());
625-
}
626-
let buffer = {
627-
let mut buffer = Vec::new();
628-
let mut decoder = Decoder::new(file_handle)?;
629-
decoder.read_to_end(&mut buffer)?;
630-
buffer
631-
};
632-
633-
let mut ranges = Self::new(index_header);
634-
let mut pos = 0;
635-
while pos < buffer.len() {
636-
let bound = pos + SIZE_BLOCK_RANGE;
637-
let range = BlockRange::from_bytes(&buffer[pos..bound]);
638-
ranges.add_range(range);
639-
pos += SIZE_BLOCK_RANGE;
640-
}
641-
642-
Ok(ranges)
643-
}
644-
645562
pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
646563
let index_header = IndexHeader::from_bytes(bytes)?;
647564
let buffer = {
@@ -676,10 +593,11 @@ impl BlockIndex {
676593
/// # Examples
677594
///
678595
/// ```rust,no_run
679-
/// use binseq::vbq::BlockIndex;
596+
/// use binseq::vbq::MmapReader;
680597
/// use std::path::Path;
681598
///
682-
/// let index = BlockIndex::from_path(Path::new("example.vbq.vqi")).unwrap();
599+
/// let reader = MmapReader::new(Path::new("example.vbq")).unwrap();
600+
/// let index = reader.load_index().unwrap();
683601
///
684602
/// // Examine the ranges to determine which blocks to process
685603
/// for (i, range) in index.ranges().iter().enumerate() {

src/vbq/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@
7171
//! ## Recent Format Changes (v0.7.0+)
7272
//!
7373
//! * **Embedded Index**: Index data is now stored within the VBQ file itself, eliminating
74-
//! separate `.vqi` files and improving portability.
74+
//! the need to manage auxiliary files and improving portability.
7575
//! * **Headers Support**: Optional sequence identifiers can be stored with each record.
7676
//! * **Extended Capacity**: u64 indexing supports files with more than 4 billion records.
7777
//! * **Multi-bit Encoding**: Support for both 2-bit and 4-bit nucleotide encodings.

0 commit comments

Comments
 (0)