diff --git a/crates/larql-cli/src/commands/primary/run_cmd.rs b/crates/larql-cli/src/commands/primary/run_cmd.rs index 5e8f5bf36..4b98a49e4 100644 --- a/crates/larql-cli/src/commands/primary/run_cmd.rs +++ b/crates/larql-cli/src/commands/primary/run_cmd.rs @@ -334,6 +334,7 @@ pub fn run(args: RunArgs) -> Result<(), Box> { args.max_tokens, &args.moe_dispatch, args.moe_predispatch_iters, + args.metal, ); } @@ -463,6 +464,7 @@ fn run_with_moe_shards( max_tokens: usize, dispatch: &str, predispatch_iters: usize, + metal: bool, ) -> Result<(), Box> { use larql_inference::ffn::moe_remote::{parse_unit_manifest, RemoteMoeBackend, ShardConfig}; use larql_inference::{generate_with_remote_moe, generate_with_remote_moe_batch}; @@ -510,7 +512,22 @@ fn run_with_moe_shards( let num_shards = configs.len(); // Initialise compute backend early so we can report it in the topology banner. - let backend = larql_compute::default_backend(); + // Mirrors the `--metal` wiring in `run_with_remote_ffn` (PR #122): explicit + // opt-in via the CLI flag, with Metal-init failure falling back to CPU. + let backend: Box = if metal { + #[cfg(all(feature = "gpu", target_os = "macos"))] + { + larql_compute_metal::metal_backend() + .map(|m| Box::new(m) as Box) + .unwrap_or_else(larql_compute::cpu_backend) + } + #[cfg(not(all(feature = "gpu", target_os = "macos")))] + { + return Err("`--metal` requires the `gpu` feature on macOS".into()); + } + } else { + larql_compute::cpu_backend() + }; eprintln!("Connecting to {} MoE shard(s)…", num_shards); let remote = RemoteMoeBackend::connect(configs) .map_err(|e| format!("failed to connect to MoE shards: {e}"))?; diff --git a/crates/larql-compute/src/pipeline_layer.rs b/crates/larql-compute/src/pipeline_layer.rs index bc4ea4149..2250f69d5 100644 --- a/crates/larql-compute/src/pipeline_layer.rs +++ b/crates/larql-compute/src/pipeline_layer.rs @@ -417,6 +417,22 @@ pub fn resolve_ffn_weights<'a>( ); } + // Pure MoE vindexes (e.g. 
Gemma-4 26B A4B) have no dense FFN tensor, so + // `interleaved_q4k.bin` is 0 bytes. The interleaved-kquant branch above is + // also skipped for these (no per-matrix manifest entries), so we fall through + // here with an empty `q4_ffn_mmap`. Return empty `QuantWeight` stubs — + // `patch_pipeline_layers_for_remote_moe` (below) overwrites the per-layer + // dense weights for MoE layers, and `moe_fn` supersedes the dense FFN path + // during decode, so the empty slices are never read. + if q4_ffn_mmap.is_empty() { + let empty = QuantWeight { + data: &[], + scales: None, + format: ffn_format, + }; + return (empty, empty, empty); + } + let q4_ffn_per_layer = q4_ffn_per_matrix * 3; let fs = layer * q4_ffn_per_layer; ( @@ -768,4 +784,30 @@ mod tests { assert!(l.ffn_is_remote, "patch should set ffn_is_remote = true"); } } + + /// Pure-MoE vindexes (e.g. Gemma-4 26B A4B) ship a zero-byte + /// `interleaved_q4k.bin` because there is no dense FFN. Before the + /// `is_empty()` guard, `resolve_ffn_weights` would panic on the first + /// `q4_ffn_mmap[fs..fs + q4_ffn_per_matrix]` slice. The guard returns + /// empty `QuantWeight` stubs — `patch_pipeline_layers_for_remote_moe` + /// overwrites the per-layer dense weights afterward and the dense FFN + /// path is bypassed entirely by `moe_fn` during decode, so the empty + /// slices are never read. + #[test] + fn resolve_ffn_weights_returns_empty_stubs_when_q4_ffn_mmap_is_empty() { + struct EmptyIdx; + impl crate::KvIndex for EmptyIdx {} + let idx = EmptyIdx; + let empty_mmap: &[u8] = &[]; + // q4_ffn_per_matrix is irrelevant on this path — what we're pinning + // is "no slice happens against the empty mmap" (i.e. no panic). 
+ let (gate, up, down) = + resolve_ffn_weights(&idx, 7, empty_mmap, 1_115_136, QuantFormat::Q4_K); + assert!(gate.data.is_empty()); + assert!(up.data.is_empty()); + assert!(down.data.is_empty()); + assert_eq!(gate.format, QuantFormat::Q4_K); + assert_eq!(up.format, QuantFormat::Q4_K); + assert_eq!(down.format, QuantFormat::Q4_K); + } }