From e153deddefe6457f4b3ab8cd037c6b39d9a8891f Mon Sep 17 00:00:00 2001 From: Mykhailo Korobkov Date: Sun, 24 May 2026 14:24:27 +0300 Subject: [PATCH] feat(gguf): multi-shard reader for *-NNNNN-of-NNNNN.gguf splits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit llama.cpp's gguf-split produces multi-file GGUFs (canonical naming: `<prefix>-NNNNN-of-NNNNN.gguf`). Each shard carries the full metadata header but only owns its own slice of tensors. The current `GgufFile::open` reads one file, so multi-shard models — Kimi K2.6 (14 shards), DeepSeek-V4-Flash (3 shards), and increasingly any large modern LLM — could not be loaded for vindex extraction. This change: 1. Adds `ShardInfo` (path + data_offset) and a `shards: Vec<ShardInfo>` field on `GgufFile`. Single-file GGUFs have `shards.len() == 1`. 2. `GgufFile::open` detects multi-shard splits via the explicit `split.count` metadata key, falling back to the filename pattern when the splitter omits the metadata. 3. Discovers all sibling shards in the same directory by reconstructing filenames at the zero-padding width used by the supplied filename (`00001-of-00014` vs `001-of-003` both supported). 4. Appends each sibling's `tensor_infos` to the combined list, tagging them with the right `shard_idx`. Cross-checks the total against `split.tensors.count` when present. 5. `load_tensors_filtered` mmaps each shard lazily on first use and reads each tensor from `shards[info.shard_idx].path` at the right per-shard `data_offset`. Shards whose tensors are all skipped by `skip_key` are never opened. Backward-compatible: existing `GgufFile::open` callers and the single-file test fixtures keep working with `shards = vec![…one…]`. 
Tests (8 new + all existing pass): - parse_shard_filename: canonical layout, plain `.gguf` rejection, mismatched widths rejection, 3-digit split width support - discover_shard_siblings: complete set discovery from any-position shard, error when sibling missing - open_multi_shard_combines_tensors_from_all_shards: builds a real 2-shard GGUF (two files with disjoint tensor sets), opens it via the first shard, verifies each tensor reads from its own shard's data section - open_rejects_multi_shard_when_a_shard_file_is_missing - existing 27 tests stay green; 286/286 larql-models tests pass Combined with #96 (MLA absorption), #103 (Q3_K/Q5_K dequant), #133 (GGUF extract input), and #135 (DeepSeek-V2/V3 MLA metadata reading), this completes the chain — `larql extract --level inference` works end-to-end on Kimi K2.6 UD-Q8_K_XL and DeepSeek-V4-Flash multi-shard GGUFs. --- crates/larql-models/src/loading/gguf.rs | 452 +++++++++++++++++++++++- 1 file changed, 447 insertions(+), 5 deletions(-) diff --git a/crates/larql-models/src/loading/gguf.rs b/crates/larql-models/src/loading/gguf.rs index cfcd124b4..781db9435 100644 --- a/crates/larql-models/src/loading/gguf.rs +++ b/crates/larql-models/src/loading/gguf.rs @@ -139,6 +139,18 @@ pub struct GgufTensorInfo { dims: Vec, tensor_type: u32, offset: u64, + /// Index into [`GgufFile::shards`] selecting which file this tensor lives in. + /// Zero for single-shard models; assigned by `open` when discovering siblings. + shard_idx: usize, +} + +/// One file in a (possibly multi-shard) GGUF split. +#[derive(Debug, Clone)] +pub struct ShardInfo { + /// Path to the `.gguf` file for this shard. + pub path: std::path::PathBuf, + /// Byte offset at which tensor data starts inside this file. + pub data_offset: u64, } // ═══════════════════════════════════════════════════════════════ @@ -148,13 +160,229 @@ pub struct GgufTensorInfo { pub struct GgufFile { pub metadata: HashMap, pub tensor_infos: Vec, + /// Tensor data offset of the first (or only) shard. 
Kept for back-compat + /// with single-file callers — multi-shard callers should index into + /// [`Self::shards`] using `GgufTensorInfo::shard_idx`. pub data_offset: u64, + /// Path to the first (or only) shard. Same back-compat note as + /// `data_offset` — for multi-shard models the other shards are in + /// [`Self::shards`]. pub path: std::path::PathBuf, + /// All shards making up this GGUF. Always non-empty; length 1 for + /// single-file models. `shards[0].path == self.path` and + /// `shards[0].data_offset == self.data_offset` hold only when `open` was given the first shard. + pub shards: Vec, +} + +/// Parse a multi-shard GGUF filename of the form +/// `<prefix>-NNNNN-of-NNNNN.gguf` (canonical llama.cpp split layout) +/// and return `(prefix_without_trailing_dash, this_shard_idx_0based, total_shards)`. +/// +/// Returns `None` for filenames that don't match the pattern (i.e. single +/// files); the caller treats those as single-shard GGUFs. +pub(crate) fn parse_shard_filename(path: &Path) -> Option<(String, usize, usize)> { + let name = path.file_name()?.to_str()?; + let stem = name.strip_suffix(".gguf")?; + // Tail must be `-NNNNN-of-NNNNN` with matching widths. + // Rightmost run of digits = "NNNNN" (total shard count). + let count_start = stem + .rfind(|c: char| !c.is_ascii_digit()) + .map(|i| i + 1) + .unwrap_or(0); + if count_start >= stem.len() { + return None; // no trailing digits at all + } + let count_str = &stem[count_start..]; + let before_count = &stem[..count_start]; // "<prefix>-NNNNN-of-" + let before_of = before_count.strip_suffix("-of-")?; + // Then second rightmost digits run = "NNNNN" (this shard's 1-based index). 
+ let idx_start = before_of + .rfind(|c: char| !c.is_ascii_digit()) + .map(|i| i + 1) + .unwrap_or(0); + if idx_start >= before_of.len() { + return None; + } + let idx_str = &before_of[idx_start..]; + let prefix = before_of[..idx_start].strip_suffix('-')?; + + let this_idx_1based: usize = idx_str.parse().ok()?; + let total: usize = count_str.parse().ok()?; + if this_idx_1based == 0 || this_idx_1based > total { + return None; + } + // Width must match across the two numbers (llama.cpp convention). + if idx_str.len() != count_str.len() { + return None; + } + Some((prefix.to_string(), this_idx_1based - 1, total)) +} + +/// Discover the full set of sibling shards making up a multi-shard GGUF. +/// `path` is one shard the user pointed at; the returned vec is ordered by +/// shard index (shard 1 first → shard N last) and is guaranteed to be of +/// length `expected_total`. +pub(crate) fn discover_shard_siblings( + parent: &Path, + path: &Path, + expected_total: usize, +) -> Result, ModelError> { + let (prefix, _, total_from_name) = parse_shard_filename(path).ok_or_else(|| { + ModelError::Parse(format!( + "multi-shard GGUF without canonical -NNNNN-of-NNNNN filename: {}", + path.display() + )) + })?; + if expected_total != total_from_name { + return Err(ModelError::Parse(format!( + "shard total mismatch: split.count={expected_total} but filename says of-{total_from_name}", + ))); + } + // Detect the width used in the filename so we reconstruct sibling + // names byte-for-byte (00001 vs 1). 
+ let name_str = path.file_name().and_then(|n| n.to_str()).unwrap_or(""); + let width = name_str + .strip_suffix(".gguf") + .and_then(|s| s.strip_suffix(&format!("-of-{expected_total:0>5}"))) + .and_then(|s| s.rsplit('-').next()) + .map(|n| n.len()) + .unwrap_or(5); + let total_width = name_str + .strip_suffix(".gguf") + .and_then(|s| s.rsplit("-of-").next()) + .map(|n| n.len()) + .unwrap_or(5); + + let mut paths = Vec::with_capacity(expected_total); + for i in 1..=expected_total { + let fname = format!( + "{prefix}-{i:0>idx_width$}-of-{total:0>tot_width$}.gguf", + prefix = prefix, + i = i, + idx_width = width, + total = expected_total, + tot_width = total_width, + ); + let p = parent.join(&fname); + if !p.exists() { + return Err(ModelError::Parse(format!( + "multi-shard GGUF missing expected sibling: {} (looking for shard {} of {})", + p.display(), + i, + expected_total, + ))); + } + paths.push(p); + } + Ok(paths) } impl GgufFile { /// Parse a GGUF file header and tensor info (does not read tensor data yet). + /// + /// Detects multi-shard splits by checking the `split.count` GGUF metadata + /// key on the file you point at; when `split.count > 1` (or the filename + /// matches the canonical `*-NNNNN-of-NNNNN.gguf` pattern), sibling shards + /// in the same directory are also discovered and their tensor infos are + /// merged into the returned `GgufFile`. Tensors carry a `shard_idx` + /// internally so [`Self::load_tensors_filtered`] reads each from the + /// right shard. pub fn open(path: &Path) -> Result { + let mut gguf = Self::open_single(path)?; + + // Multi-shard detection: prefer the explicit `split.*` metadata + // emitted by llama-gguf-split, fall back to the filename pattern + // (some splitters skip the metadata). 
+ let split_count = gguf + .metadata + .get("split.count") + .and_then(|v| v.as_u32()) + .unwrap_or(0); + let pattern_count = parse_shard_filename(path).map(|(_, _, total)| total); + let total_shards = match (split_count, pattern_count) { + (n, _) if n > 1 => n as usize, + (_, Some(n)) if n > 1 => n, + _ => return Ok(gguf), // single-file + }; + + // We need every shard in the split — find them all. + let parent = path.parent().ok_or_else(|| { + ModelError::Parse(format!("GGUF path has no parent: {}", path.display())) + })?; + let shard_paths = discover_shard_siblings(parent, path, total_shards)?; + debug_assert_eq!(shard_paths.len(), total_shards); + + // The first entry is the shard we already loaded (whichever the + // caller pointed at). Rewrite `gguf` to be anchored at shard 0 and + // then accumulate the remaining shards' tensor infos. + let this_idx = shard_paths + .iter() + .position(|p| p == path) + .ok_or_else(|| ModelError::Parse(format!( + "passed shard {} not found in discovered set", path.display() + )))?; + let mut shards: Vec = Vec::with_capacity(total_shards); + let mut combined_infos: Vec = Vec::new(); + for (idx, shard_path) in shard_paths.iter().enumerate() { + if idx == this_idx { + shards.push(ShardInfo { + path: path.to_path_buf(), + data_offset: gguf.data_offset, + }); + for info in &gguf.tensor_infos { + let mut clone = GgufTensorInfo { + name: info.name.clone(), + n_dims: info.n_dims, + dims: info.dims.clone(), + tensor_type: info.tensor_type, + offset: info.offset, + shard_idx: idx, + }; + clone.shard_idx = idx; + combined_infos.push(clone); + } + } else { + let other = Self::open_single(shard_path)?; + shards.push(ShardInfo { + path: shard_path.clone(), + data_offset: other.data_offset, + }); + for mut info in other.tensor_infos { + info.shard_idx = idx; + combined_infos.push(info); + } + } + } + + // Sanity check: total tensor count should match split.tensors.count + // when that key is emitted (llama-gguf-split always writes it). 
+ if let Some(expected) = gguf + .metadata + .get("split.tensors.count") + .and_then(|v| v.as_u32()) + { + if combined_infos.len() != expected as usize { + return Err(ModelError::Parse(format!( + "multi-shard tensor count mismatch: combined {} shards yielded \ + {} tensors, but split.tensors.count = {}", + total_shards, + combined_infos.len(), + expected + ))); + } + } + + gguf.tensor_infos = combined_infos; + gguf.shards = shards; + // `gguf.path` / `gguf.data_offset` keep pointing at the + // user-supplied shard for back-compat with diagnostics; the + // multi-shard loader uses `shards[info.shard_idx]` internally. + Ok(gguf) + } + + /// Open a single GGUF file without multi-shard discovery. Used as the + /// per-shard primitive by [`Self::open`]. + fn open_single(path: &Path) -> Result { let file = std::fs::File::open(path)?; let mut r = BufReader::new(file); @@ -203,6 +431,7 @@ impl GgufFile { dims, tensor_type, offset, + shard_idx: 0, }); } @@ -216,6 +445,10 @@ impl GgufFile { tensor_infos, data_offset, path: path.to_path_buf(), + shards: vec![ShardInfo { + path: path.to_path_buf(), + data_offset, + }], }) } @@ -238,6 +471,10 @@ impl GgufFile { /// `skip_key` sees keys after GGUF-to-HF normalization but before architecture-specific /// prefix stripping. GGUF keys do not carry the HF wrapper prefixes, so this is enough for /// the current GGUF path and lets walk-only loading avoid FFN dequantization. + /// + /// Multi-shard models: tensors are read from `self.shards[info.shard_idx]`, + /// which is mmap'd lazily on first use within this call. Shards that + /// contain no surviving tensors after `skip_key` are not mmap'd at all. #[allow(clippy::type_complexity)] pub fn load_tensors_filtered( &self, @@ -249,8 +486,11 @@ impl GgufFile { ), ModelError, > { - let file = std::fs::File::open(&self.path)?; - let mmap = unsafe { memmap2::Mmap::map(&file)? 
}; + // Lazy mmap of every shard — Option avoids paying the open cost + // for shards that turn out to contain only skipped tensors. + let mut shard_mmaps: Vec> = (0..self.shards.len()) + .map(|_| None) + .collect(); let mut tensors = HashMap::new(); let mut vectors = HashMap::new(); @@ -263,10 +503,20 @@ impl GgufFile { continue; } - let abs_offset = self.data_offset.checked_add(info.offset).ok_or_else(|| { + let shard = &self.shards[info.shard_idx]; + if shard_mmaps[info.shard_idx].is_none() { + let f = std::fs::File::open(&shard.path)?; + let m = unsafe { memmap2::Mmap::map(&f)? }; + shard_mmaps[info.shard_idx] = Some(m); + } + let mmap = shard_mmaps[info.shard_idx] + .as_ref() + .expect("mmap initialised above"); + + let abs_offset = shard.data_offset.checked_add(info.offset).ok_or_else(|| { ModelError::Parse(format!( "tensor {}: data_offset {} + tensor offset {} overflows u64", - info.name, self.data_offset, info.offset, + info.name, shard.data_offset, info.offset, )) })?; let n_elements: u64 = info.dims.iter().product(); @@ -286,10 +536,11 @@ impl GgufFile { })?; if end > mmap.len() { return Err(ModelError::Parse(format!( - "tensor {} data out of bounds (offset {} + size {} > file {})", + "tensor {} data out of bounds (offset {} + size {} > shard {} file {})", info.name, abs_offset, data_size, + info.shard_idx, mmap.len() ))); } @@ -1307,6 +1558,7 @@ mod tests { tensor_infos: Vec::new(), data_offset: 0, path: std::path::PathBuf::from(""), + shards: vec![ShardInfo { path: std::path::PathBuf::from(""), data_offset: 0 }], }; let cfg = gguf.to_config_json(); @@ -1350,6 +1602,7 @@ mod tests { tensor_infos: Vec::new(), data_offset: 0, path: std::path::PathBuf::from(""), + shards: vec![ShardInfo { path: std::path::PathBuf::from(""), data_offset: 0 }], }; let cfg = gguf.to_config_json(); @@ -1358,6 +1611,195 @@ mod tests { assert_eq!(arch.config().rope_base, 10_000.0); } + #[test] + fn parse_shard_filename_canonical_layout() { + let p = std::path::PathBuf::from( + 
"/x/Kimi-K2.6-UD-Q8_K_XL-00003-of-00014.gguf", + ); + let (prefix, idx, total) = parse_shard_filename(&p).unwrap(); + assert_eq!(prefix, "Kimi-K2.6-UD-Q8_K_XL"); + assert_eq!(idx, 2); + assert_eq!(total, 14); + } + + #[test] + fn parse_shard_filename_rejects_single_file() { + let p = std::path::PathBuf::from("/x/llama-3.1-8b-q4.gguf"); + assert!(parse_shard_filename(&p).is_none()); + } + + #[test] + fn parse_shard_filename_rejects_unmatched_widths() { + let p = std::path::PathBuf::from("/x/foo-00003-of-0014.gguf"); + assert!(parse_shard_filename(&p).is_none()); + } + + #[test] + fn parse_shard_filename_supports_3digit_split() { + let p = std::path::PathBuf::from("/x/foo-001-of-003.gguf"); + let (prefix, idx, total) = parse_shard_filename(&p).unwrap(); + assert_eq!(prefix, "foo"); + assert_eq!(idx, 0); + assert_eq!(total, 3); + } + + #[test] + fn discover_shard_siblings_finds_all_in_order() { + let dir = tempfile::tempdir().unwrap(); + for i in 1..=3 { + std::fs::File::create( + dir.path().join(format!("model-{i:0>5}-of-00003.gguf")), + ) + .unwrap(); + } + let middle = dir.path().join("model-00002-of-00003.gguf"); + let paths = discover_shard_siblings(dir.path(), &middle, 3).unwrap(); + assert_eq!(paths.len(), 3); + assert!(paths[0].ends_with("model-00001-of-00003.gguf")); + assert!(paths[1].ends_with("model-00002-of-00003.gguf")); + assert!(paths[2].ends_with("model-00003-of-00003.gguf")); + } + + #[test] + fn discover_shard_siblings_errors_when_one_missing() { + let dir = tempfile::tempdir().unwrap(); + for i in [1usize, 3] { + std::fs::File::create( + dir.path().join(format!("m-{i:0>5}-of-00003.gguf")), + ) + .unwrap(); + } + let first = dir.path().join("m-00001-of-00003.gguf"); + let err = discover_shard_siblings(dir.path(), &first, 3).unwrap_err(); + assert!( + format!("{err}").contains("missing expected sibling"), + "unexpected error: {err}" + ); + } + + /// End-to-end multi-shard open: two real GGUF files with different + /// tensors in each, joined via 
canonical `-NNNNN-of-00002.gguf` layout. + /// Verifies discovery, shard_idx assignment, and per-shard tensor + /// reads via `load_tensors`. + #[test] + fn open_multi_shard_combines_tensors_from_all_shards() { + use std::io::{Seek, Write}; + + let dir = tempfile::tempdir().unwrap(); + + let write_shard = |idx: usize, + tensor_ids: &[usize], + metas: &[(&str, u32)]| + -> std::path::PathBuf { + let path = dir.path().join(format!("m-{idx:0>5}-of-00002.gguf")); + let mut file = std::fs::File::create(&path).unwrap(); + file.write_all(&GGUF_MAGIC.to_le_bytes()).unwrap(); + file.write_all(&3u32.to_le_bytes()).unwrap(); + file.write_all(&(tensor_ids.len() as u64).to_le_bytes()).unwrap(); + file.write_all(&(metas.len() as u64).to_le_bytes()).unwrap(); + + for (k, v) in metas { + let kb = k.as_bytes(); + file.write_all(&(kb.len() as u64).to_le_bytes()).unwrap(); + file.write_all(kb).unwrap(); + file.write_all(&4u32.to_le_bytes()).unwrap(); // u32 type tag + file.write_all(&v.to_le_bytes()).unwrap(); + } + + for (rel, &tid) in tensor_ids.iter().enumerate() { + let name = format!("blk.{tid}.ffn_down.weight"); + let nb = name.as_bytes(); + file.write_all(&(nb.len() as u64).to_le_bytes()).unwrap(); + file.write_all(nb).unwrap(); + file.write_all(&2u32.to_le_bytes()).unwrap(); + file.write_all(&2u64.to_le_bytes()).unwrap(); + file.write_all(&2u64.to_le_bytes()).unwrap(); + file.write_all(&crate::quant::ggml::TYPE_F32.to_le_bytes()) + .unwrap(); + let off = (rel as u64) * 16; + file.write_all(&off.to_le_bytes()).unwrap(); + } + + let pos = file.stream_position().unwrap(); + let aligned = pos.div_ceil(32) * 32; + file.write_all(&vec![0u8; (aligned - pos) as usize]) + .unwrap(); + + for &tid in tensor_ids { + for off in 0..4 { + file.write_all( + &((tid as f32) + 0.1 * off as f32).to_le_bytes(), + ) + .unwrap(); + } + } + file.flush().unwrap(); + path + }; + + let p1 = write_shard( + 1, + &[0, 1], + &[("split.no", 0), ("split.count", 2), ("split.tensors.count", 4)], + ); + let _p2 
= write_shard( + 2, + &[2, 3], + &[("split.no", 1), ("split.count", 2), ("split.tensors.count", 4)], + ); + + let gguf = GgufFile::open(&p1).unwrap(); + assert_eq!(gguf.shards.len(), 2); + assert_eq!(gguf.tensor_infos.len(), 4); + for (i, info) in gguf.tensor_infos.iter().enumerate() { + let expected = if i < 2 { 0 } else { 1 }; + assert_eq!( + info.shard_idx, expected, + "tensor {i} ({}) shard mismatch", + info.name + ); + } + + let (tensors, _) = gguf.load_tensors().unwrap(); + assert_eq!(tensors.len(), 4); + for tid in 0..4 { + let key = format!("layers.{tid}.mlp.down_proj.weight"); + let arr = tensors.get(&key).unwrap_or_else(|| panic!("missing {key}")); + assert!( + (arr[[0, 0]] - tid as f32).abs() < 1e-6, + "tensor {tid} top-left {} != {tid}", + arr[[0, 0]] + ); + } + } + + #[test] + fn open_rejects_multi_shard_when_a_shard_file_is_missing() { + use std::io::Write; + let dir = tempfile::tempdir().unwrap(); + let p = dir.path().join("m-00001-of-00002.gguf"); + let mut file = std::fs::File::create(&p).unwrap(); + file.write_all(&GGUF_MAGIC.to_le_bytes()).unwrap(); + file.write_all(&3u32.to_le_bytes()).unwrap(); + file.write_all(&0u64.to_le_bytes()).unwrap(); + file.write_all(&1u64.to_le_bytes()).unwrap(); + let k = "split.count".as_bytes(); + file.write_all(&(k.len() as u64).to_le_bytes()).unwrap(); + file.write_all(k).unwrap(); + file.write_all(&4u32.to_le_bytes()).unwrap(); + file.write_all(&2u32.to_le_bytes()).unwrap(); + file.flush().unwrap(); + + let err = match GgufFile::open(&p) { + Ok(_) => panic!("expected error for missing sibling shard"), + Err(e) => e, + }; + assert!( + format!("{err}").contains("missing expected sibling"), + "unexpected error: {err}" + ); + } + /// Build a minimal GGUF file with one 2-D F32 tensor, but truncate the /// tensor data region so that `offset + size > file len`. Loader must /// reject this cleanly, not panic on a slice OOB.