diff --git a/crates/larql-models/src/loading/gguf.rs b/crates/larql-models/src/loading/gguf.rs index cfcd124b4..781db9435 100644 --- a/crates/larql-models/src/loading/gguf.rs +++ b/crates/larql-models/src/loading/gguf.rs @@ -139,6 +139,18 @@ pub struct GgufTensorInfo { dims: Vec, tensor_type: u32, offset: u64, + /// Index into [`GgufFile::shards`] selecting which file this tensor lives in. + /// Zero for single-shard models; assigned by `open` when discovering siblings. + shard_idx: usize, +} + +/// One file in a (possibly multi-shard) GGUF split. +#[derive(Debug, Clone)] +pub struct ShardInfo { + /// Path to the `.gguf` file for this shard. + pub path: std::path::PathBuf, + /// Byte offset at which tensor data starts inside this file. + pub data_offset: u64, } // ═══════════════════════════════════════════════════════════════ @@ -148,13 +160,229 @@ pub struct GgufTensorInfo { pub struct GgufFile { pub metadata: HashMap, pub tensor_infos: Vec, + /// Tensor data offset of the first (or only) shard. Kept for back-compat + /// with single-file callers — multi-shard callers should index into + /// [`Self::shards`] using `GgufTensorInfo::shard_idx`. pub data_offset: u64, + /// Path to the first (or only) shard. Same back-compat note as + /// `data_offset` — for multi-shard models the other shards are in + /// [`Self::shards`]. pub path: std::path::PathBuf, + /// All shards making up this GGUF. Always non-empty; length 1 for + /// single-file models. `shards[0].path == self.path` and + /// `shards[0].data_offset == self.data_offset` always hold. + pub shards: Vec, +} + +/// Parse a multi-shard GGUF filename of the form +/// `--of-.gguf` (canonical llama.cpp split layout) +/// and return `(prefix_without_dashes, this_shard_idx_0based, total_shards)`. +/// +/// Returns `None` for filenames that don't match the pattern (i.e. single +/// files); the caller treats those as single-shard GGUFs. 
+pub(crate) fn parse_shard_filename(path: &Path) -> Option<(String, usize, usize)> { + let name = path.file_name()?.to_str()?; + let stem = name.strip_suffix(".gguf")?; + // Tail must be `-NNNNN-of-NNNNN` with matching widths. + // Rightmost run of digits = "NNNNN" (total shard count). + let count_start = stem + .rfind(|c: char| !c.is_ascii_digit()) + .map(|i| i + 1) + .unwrap_or(0); + if count_start >= stem.len() { + return None; // no trailing digits at all + } + let count_str = &stem[count_start..]; + let before_count = &stem[..count_start]; // "-NNNNN-of-" + let before_of = before_count.strip_suffix("-of-")?; + // Then second rightmost digits run = "NNNNN" (this shard's 1-based index). + let idx_start = before_of + .rfind(|c: char| !c.is_ascii_digit()) + .map(|i| i + 1) + .unwrap_or(0); + if idx_start >= before_of.len() { + return None; + } + let idx_str = &before_of[idx_start..]; + let prefix = before_of[..idx_start].strip_suffix('-')?; + + let this_idx_1based: usize = idx_str.parse().ok()?; + let total: usize = count_str.parse().ok()?; + if this_idx_1based == 0 || this_idx_1based > total { + return None; + } + // Width must match across the two numbers (llama.cpp convention). + if idx_str.len() != count_str.len() { + return None; + } + Some((prefix.to_string(), this_idx_1based - 1, total)) +} + +/// Discover the full set of sibling shards making up a multi-shard GGUF. +/// `path` is one shard the user pointed at; the returned vec is ordered by +/// shard index (shard 1 first → shard N last) and is guaranteed to be of +/// length `expected_total`. 
///
/// Sibling names are rebuilt as `<prefix>-<i>-of-<total>.gguf` inside
/// `parent`, zero-padding both numbers to the digit width used by `path`'s
/// own filename, so 3-digit and 5-digit splits are both reconstructed
/// exactly.
///
/// # Errors
///
/// Returns `ModelError::Parse` when `path` does not have a canonical shard
/// filename, when `expected_total` disagrees with the total encoded in the
/// filename, or when any reconstructed sibling path does not exist.
pub(crate) fn discover_shard_siblings(
    parent: &Path,
    path: &Path,
    expected_total: usize,
) -> Result<Vec<std::path::PathBuf>, ModelError> {
    let (prefix, _, total_from_name) = parse_shard_filename(path).ok_or_else(|| {
        ModelError::Parse(format!(
            "multi-shard GGUF without canonical -NNNNN-of-NNNNN filename: {}",
            path.display()
        ))
    })?;
    if expected_total != total_from_name {
        return Err(ModelError::Parse(format!(
            "shard total mismatch: split.count={expected_total} but filename says of-{total_from_name}",
        )));
    }

    // Padding width = length of the trailing digit run (the total field).
    // `parse_shard_filename` only succeeds when the index field has the same
    // width, so one value serves both numbers. The `unwrap_or("")` arm is
    // unreachable after a successful parse; it just avoids a panic path.
    let stem = path
        .file_name()
        .and_then(|n| n.to_str())
        .and_then(|n| n.strip_suffix(".gguf"))
        .unwrap_or("");
    let width = stem.len() - stem.trim_end_matches(|c: char| c.is_ascii_digit()).len();

    // Shard numbers on disk are 1-based; stop at the first missing sibling.
    (1..=expected_total)
        .map(|i| {
            let p = parent.join(format!(
                "{prefix}-{i:0>width$}-of-{expected_total:0>width$}.gguf"
            ));
            if p.exists() {
                Ok(p)
            } else {
                Err(ModelError::Parse(format!(
                    "multi-shard GGUF missing expected sibling: {} (looking for shard {} of {})",
                    p.display(),
                    i,
                    expected_total,
                )))
            }
        })
        .collect()
}

impl GgufFile {
    /// Parse a GGUF file header and tensor info (does not read tensor data yet).
+ /// + /// Detects multi-shard splits by checking the `split.count` GGUF metadata + /// key on the file you point at; when `split.count > 1` (or the filename + /// matches the canonical `*-NNNNN-of-NNNNN.gguf` pattern), sibling shards + /// in the same directory are also discovered and their tensor infos are + /// merged into the returned `GgufFile`. Tensors carry a `shard_idx` + /// internally so [`Self::load_tensors_filtered`] reads each from the + /// right shard. pub fn open(path: &Path) -> Result { + let mut gguf = Self::open_single(path)?; + + // Multi-shard detection: prefer the explicit `split.*` metadata + // emitted by llama-gguf-split, fall back to the filename pattern + // (some splitters skip the metadata). + let split_count = gguf + .metadata + .get("split.count") + .and_then(|v| v.as_u32()) + .unwrap_or(0); + let pattern_count = parse_shard_filename(path).map(|(_, _, total)| total); + let total_shards = match (split_count, pattern_count) { + (n, _) if n > 1 => n as usize, + (_, Some(n)) if n > 1 => n, + _ => return Ok(gguf), // single-file + }; + + // We need every shard in the split — find them all. + let parent = path.parent().ok_or_else(|| { + ModelError::Parse(format!("GGUF path has no parent: {}", path.display())) + })?; + let shard_paths = discover_shard_siblings(parent, path, total_shards)?; + debug_assert_eq!(shard_paths.len(), total_shards); + + // The first entry is the shard we already loaded (whichever the + // caller pointed at). Rewrite `gguf` to be anchored at shard 0 and + // then accumulate the remaining shards' tensor infos. 
+ let this_idx = shard_paths + .iter() + .position(|p| p == path) + .ok_or_else(|| ModelError::Parse(format!( + "passed shard {} not found in discovered set", path.display() + )))?; + let mut shards: Vec = Vec::with_capacity(total_shards); + let mut combined_infos: Vec = Vec::new(); + for (idx, shard_path) in shard_paths.iter().enumerate() { + if idx == this_idx { + shards.push(ShardInfo { + path: path.to_path_buf(), + data_offset: gguf.data_offset, + }); + for info in &gguf.tensor_infos { + let mut clone = GgufTensorInfo { + name: info.name.clone(), + n_dims: info.n_dims, + dims: info.dims.clone(), + tensor_type: info.tensor_type, + offset: info.offset, + shard_idx: idx, + }; + clone.shard_idx = idx; + combined_infos.push(clone); + } + } else { + let other = Self::open_single(shard_path)?; + shards.push(ShardInfo { + path: shard_path.clone(), + data_offset: other.data_offset, + }); + for mut info in other.tensor_infos { + info.shard_idx = idx; + combined_infos.push(info); + } + } + } + + // Sanity check: total tensor count should match split.tensors.count + // when that key is emitted (llama-gguf-split always writes it). + if let Some(expected) = gguf + .metadata + .get("split.tensors.count") + .and_then(|v| v.as_u32()) + { + if combined_infos.len() != expected as usize { + return Err(ModelError::Parse(format!( + "multi-shard tensor count mismatch: combined {} shards yielded \ + {} tensors, but split.tensors.count = {}", + total_shards, + combined_infos.len(), + expected + ))); + } + } + + gguf.tensor_infos = combined_infos; + gguf.shards = shards; + // `gguf.path` / `gguf.data_offset` keep pointing at the + // user-supplied shard for back-compat with diagnostics; the + // multi-shard loader uses `shards[info.shard_idx]` internally. + Ok(gguf) + } + + /// Open a single GGUF file without multi-shard discovery. Used as the + /// per-shard primitive by [`Self::open`]. 
+ fn open_single(path: &Path) -> Result { let file = std::fs::File::open(path)?; let mut r = BufReader::new(file); @@ -203,6 +431,7 @@ impl GgufFile { dims, tensor_type, offset, + shard_idx: 0, }); } @@ -216,6 +445,10 @@ impl GgufFile { tensor_infos, data_offset, path: path.to_path_buf(), + shards: vec![ShardInfo { + path: path.to_path_buf(), + data_offset, + }], }) } @@ -238,6 +471,10 @@ impl GgufFile { /// `skip_key` sees keys after GGUF-to-HF normalization but before architecture-specific /// prefix stripping. GGUF keys do not carry the HF wrapper prefixes, so this is enough for /// the current GGUF path and lets walk-only loading avoid FFN dequantization. + /// + /// Multi-shard models: tensors are read from `self.shards[info.shard_idx]`, + /// which is mmap'd lazily on first use within this call. Shards that + /// contain no surviving tensors after `skip_key` are not mmap'd at all. #[allow(clippy::type_complexity)] pub fn load_tensors_filtered( &self, @@ -249,8 +486,11 @@ impl GgufFile { ), ModelError, > { - let file = std::fs::File::open(&self.path)?; - let mmap = unsafe { memmap2::Mmap::map(&file)? }; + // Lazy mmap of every shard — Option avoids paying the open cost + // for shards that turn out to contain only skipped tensors. + let mut shard_mmaps: Vec> = (0..self.shards.len()) + .map(|_| None) + .collect(); let mut tensors = HashMap::new(); let mut vectors = HashMap::new(); @@ -263,10 +503,20 @@ impl GgufFile { continue; } - let abs_offset = self.data_offset.checked_add(info.offset).ok_or_else(|| { + let shard = &self.shards[info.shard_idx]; + if shard_mmaps[info.shard_idx].is_none() { + let f = std::fs::File::open(&shard.path)?; + let m = unsafe { memmap2::Mmap::map(&f)? 
}; + shard_mmaps[info.shard_idx] = Some(m); + } + let mmap = shard_mmaps[info.shard_idx] + .as_ref() + .expect("mmap initialised above"); + + let abs_offset = shard.data_offset.checked_add(info.offset).ok_or_else(|| { ModelError::Parse(format!( "tensor {}: data_offset {} + tensor offset {} overflows u64", - info.name, self.data_offset, info.offset, + info.name, shard.data_offset, info.offset, )) })?; let n_elements: u64 = info.dims.iter().product(); @@ -286,10 +536,11 @@ impl GgufFile { })?; if end > mmap.len() { return Err(ModelError::Parse(format!( - "tensor {} data out of bounds (offset {} + size {} > file {})", + "tensor {} data out of bounds (offset {} + size {} > shard {} file {})", info.name, abs_offset, data_size, + info.shard_idx, mmap.len() ))); } @@ -1307,6 +1558,7 @@ mod tests { tensor_infos: Vec::new(), data_offset: 0, path: std::path::PathBuf::from(""), + shards: vec![ShardInfo { path: std::path::PathBuf::from(""), data_offset: 0 }], }; let cfg = gguf.to_config_json(); @@ -1350,6 +1602,7 @@ mod tests { tensor_infos: Vec::new(), data_offset: 0, path: std::path::PathBuf::from(""), + shards: vec![ShardInfo { path: std::path::PathBuf::from(""), data_offset: 0 }], }; let cfg = gguf.to_config_json(); @@ -1358,6 +1611,195 @@ mod tests { assert_eq!(arch.config().rope_base, 10_000.0); } + #[test] + fn parse_shard_filename_canonical_layout() { + let p = std::path::PathBuf::from( + "/x/Kimi-K2.6-UD-Q8_K_XL-00003-of-00014.gguf", + ); + let (prefix, idx, total) = parse_shard_filename(&p).unwrap(); + assert_eq!(prefix, "Kimi-K2.6-UD-Q8_K_XL"); + assert_eq!(idx, 2); + assert_eq!(total, 14); + } + + #[test] + fn parse_shard_filename_rejects_single_file() { + let p = std::path::PathBuf::from("/x/llama-3.1-8b-q4.gguf"); + assert!(parse_shard_filename(&p).is_none()); + } + + #[test] + fn parse_shard_filename_rejects_unmatched_widths() { + let p = std::path::PathBuf::from("/x/foo-00003-of-0014.gguf"); + assert!(parse_shard_filename(&p).is_none()); + } + + #[test] + fn 
parse_shard_filename_supports_3digit_split() { + let p = std::path::PathBuf::from("/x/foo-001-of-003.gguf"); + let (prefix, idx, total) = parse_shard_filename(&p).unwrap(); + assert_eq!(prefix, "foo"); + assert_eq!(idx, 0); + assert_eq!(total, 3); + } + + #[test] + fn discover_shard_siblings_finds_all_in_order() { + let dir = tempfile::tempdir().unwrap(); + for i in 1..=3 { + std::fs::File::create( + dir.path().join(format!("model-{i:0>5}-of-00003.gguf")), + ) + .unwrap(); + } + let middle = dir.path().join("model-00002-of-00003.gguf"); + let paths = discover_shard_siblings(dir.path(), &middle, 3).unwrap(); + assert_eq!(paths.len(), 3); + assert!(paths[0].ends_with("model-00001-of-00003.gguf")); + assert!(paths[1].ends_with("model-00002-of-00003.gguf")); + assert!(paths[2].ends_with("model-00003-of-00003.gguf")); + } + + #[test] + fn discover_shard_siblings_errors_when_one_missing() { + let dir = tempfile::tempdir().unwrap(); + for i in [1usize, 3] { + std::fs::File::create( + dir.path().join(format!("m-{i:0>5}-of-00003.gguf")), + ) + .unwrap(); + } + let first = dir.path().join("m-00001-of-00003.gguf"); + let err = discover_shard_siblings(dir.path(), &first, 3).unwrap_err(); + assert!( + format!("{err}").contains("missing expected sibling"), + "unexpected error: {err}" + ); + } + + /// End-to-end multi-shard open: two real GGUF files with different + /// tensors in each, joined via canonical `-NNNNN-of-00002.gguf` layout. + /// Verifies discovery, shard_idx assignment, and per-shard tensor + /// reads via `load_tensors`. 
+ #[test] + fn open_multi_shard_combines_tensors_from_all_shards() { + use std::io::{Seek, Write}; + + let dir = tempfile::tempdir().unwrap(); + + let write_shard = |idx: usize, + tensor_ids: &[usize], + metas: &[(&str, u32)]| + -> std::path::PathBuf { + let path = dir.path().join(format!("m-{idx:0>5}-of-00002.gguf")); + let mut file = std::fs::File::create(&path).unwrap(); + file.write_all(&GGUF_MAGIC.to_le_bytes()).unwrap(); + file.write_all(&3u32.to_le_bytes()).unwrap(); + file.write_all(&(tensor_ids.len() as u64).to_le_bytes()).unwrap(); + file.write_all(&(metas.len() as u64).to_le_bytes()).unwrap(); + + for (k, v) in metas { + let kb = k.as_bytes(); + file.write_all(&(kb.len() as u64).to_le_bytes()).unwrap(); + file.write_all(kb).unwrap(); + file.write_all(&4u32.to_le_bytes()).unwrap(); // u32 type tag + file.write_all(&v.to_le_bytes()).unwrap(); + } + + for (rel, &tid) in tensor_ids.iter().enumerate() { + let name = format!("blk.{tid}.ffn_down.weight"); + let nb = name.as_bytes(); + file.write_all(&(nb.len() as u64).to_le_bytes()).unwrap(); + file.write_all(nb).unwrap(); + file.write_all(&2u32.to_le_bytes()).unwrap(); + file.write_all(&2u64.to_le_bytes()).unwrap(); + file.write_all(&2u64.to_le_bytes()).unwrap(); + file.write_all(&crate::quant::ggml::TYPE_F32.to_le_bytes()) + .unwrap(); + let off = (rel as u64) * 16; + file.write_all(&off.to_le_bytes()).unwrap(); + } + + let pos = file.stream_position().unwrap(); + let aligned = pos.div_ceil(32) * 32; + file.write_all(&vec![0u8; (aligned - pos) as usize]) + .unwrap(); + + for &tid in tensor_ids { + for off in 0..4 { + file.write_all( + &((tid as f32) + 0.1 * off as f32).to_le_bytes(), + ) + .unwrap(); + } + } + file.flush().unwrap(); + path + }; + + let p1 = write_shard( + 1, + &[0, 1], + &[("split.no", 0), ("split.count", 2), ("split.tensors.count", 4)], + ); + let _p2 = write_shard( + 2, + &[2, 3], + &[("split.no", 1), ("split.count", 2), ("split.tensors.count", 4)], + ); + + let gguf = 
GgufFile::open(&p1).unwrap(); + assert_eq!(gguf.shards.len(), 2); + assert_eq!(gguf.tensor_infos.len(), 4); + for (i, info) in gguf.tensor_infos.iter().enumerate() { + let expected = if i < 2 { 0 } else { 1 }; + assert_eq!( + info.shard_idx, expected, + "tensor {i} ({}) shard mismatch", + info.name + ); + } + + let (tensors, _) = gguf.load_tensors().unwrap(); + assert_eq!(tensors.len(), 4); + for tid in 0..4 { + let key = format!("layers.{tid}.mlp.down_proj.weight"); + let arr = tensors.get(&key).unwrap_or_else(|| panic!("missing {key}")); + assert!( + (arr[[0, 0]] - tid as f32).abs() < 1e-6, + "tensor {tid} top-left {} != {tid}", + arr[[0, 0]] + ); + } + } + + #[test] + fn open_rejects_multi_shard_when_a_shard_file_is_missing() { + use std::io::Write; + let dir = tempfile::tempdir().unwrap(); + let p = dir.path().join("m-00001-of-00002.gguf"); + let mut file = std::fs::File::create(&p).unwrap(); + file.write_all(&GGUF_MAGIC.to_le_bytes()).unwrap(); + file.write_all(&3u32.to_le_bytes()).unwrap(); + file.write_all(&0u64.to_le_bytes()).unwrap(); + file.write_all(&1u64.to_le_bytes()).unwrap(); + let k = "split.count".as_bytes(); + file.write_all(&(k.len() as u64).to_le_bytes()).unwrap(); + file.write_all(k).unwrap(); + file.write_all(&4u32.to_le_bytes()).unwrap(); + file.write_all(&2u32.to_le_bytes()).unwrap(); + file.flush().unwrap(); + + let err = match GgufFile::open(&p) { + Ok(_) => panic!("expected error for missing sibling shard"), + Err(e) => e, + }; + assert!( + format!("{err}").contains("missing expected sibling"), + "unexpected error: {err}" + ); + } + /// Build a minimal GGUF file with one 2-D F32 tensor, but truncate the /// tensor data region so that `offset + size > file len`. Loader must /// reject this cleanly, not panic on a slice OOB.