Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion crates/larql-cli/src/commands/primary/run_cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,7 @@ pub fn run(args: RunArgs) -> Result<(), Box<dyn std::error::Error>> {
args.max_tokens,
&args.moe_dispatch,
args.moe_predispatch_iters,
args.metal,
);
}

Expand Down Expand Up @@ -463,6 +464,7 @@ fn run_with_moe_shards(
max_tokens: usize,
dispatch: &str,
predispatch_iters: usize,
metal: bool,
) -> Result<(), Box<dyn std::error::Error>> {
use larql_inference::ffn::moe_remote::{parse_unit_manifest, RemoteMoeBackend, ShardConfig};
use larql_inference::{generate_with_remote_moe, generate_with_remote_moe_batch};
Expand Down Expand Up @@ -510,7 +512,22 @@ fn run_with_moe_shards(

let num_shards = configs.len();
// Initialise compute backend early so we can report it in the topology banner.
let backend = larql_compute::default_backend();
// Mirrors the `--metal` wiring in `run_with_remote_ffn` (PR #122): explicit
// opt-in via the CLI flag, with Metal-init failure falling back to CPU.
let backend: Box<dyn larql_compute::ComputeBackend> = if metal {
#[cfg(all(feature = "gpu", target_os = "macos"))]
{
larql_compute_metal::metal_backend()
.map(|m| Box::new(m) as Box<dyn larql_compute::ComputeBackend>)
.unwrap_or_else(larql_compute::cpu_backend)
}
#[cfg(not(all(feature = "gpu", target_os = "macos")))]
{
return Err("`--metal` requires the `gpu` feature on macOS".into());
}
} else {
larql_compute::cpu_backend()
};
eprintln!("Connecting to {} MoE shard(s)…", num_shards);
let remote = RemoteMoeBackend::connect(configs)
.map_err(|e| format!("failed to connect to MoE shards: {e}"))?;
Expand Down
42 changes: 42 additions & 0 deletions crates/larql-compute/src/pipeline_layer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,22 @@ pub fn resolve_ffn_weights<'a>(
);
}

// Pure MoE vindexes (e.g. Gemma-4 26B A4B) have no dense FFN tensor, so
// `interleaved_q4k.bin` is 0 bytes. The interleaved-kquant branch above is
// not taken for these either (no per-matrix manifest entries), so we fall
// through to here with an empty `q4_ffn_mmap`. Return empty `QuantWeight`
// stubs: `patch_pipeline_layers_for_remote_moe` (below) overwrites the
// per-layer dense weights for MoE layers, and `moe_fn` supersedes the dense
// FFN path during decode, so the empty slices are never read.
if q4_ffn_mmap.is_empty() {
let empty = QuantWeight {
data: &[],
scales: None,
format: ffn_format,
};
return (empty, empty, empty);
}

let q4_ffn_per_layer = q4_ffn_per_matrix * 3;
let fs = layer * q4_ffn_per_layer;
(
Expand Down Expand Up @@ -768,4 +784,30 @@ mod tests {
assert!(l.ffn_is_remote, "patch should set ffn_is_remote = true");
}
}

/// Regression test for the empty-mmap guard in `resolve_ffn_weights`.
///
/// A pure-MoE vindex (e.g. Gemma-4 26B A4B) has no dense FFN, so its
/// `interleaved_q4k.bin` is zero bytes long. Without the `is_empty()`
/// guard, the first `q4_ffn_mmap[fs..fs + q4_ffn_per_matrix]` slice
/// panicked. With the guard, all three returned weights are empty
/// `QuantWeight` stubs that keep the requested format. The stubs are never
/// read: `patch_pipeline_layers_for_remote_moe` overwrites the per-layer
/// MoE weights afterward, and `moe_fn` bypasses the dense FFN path
/// entirely during decode.
#[test]
fn resolve_ffn_weights_returns_empty_stubs_when_q4_ffn_mmap_is_empty() {
    // Minimal index relying only on the trait's default behaviour.
    struct EmptyIdx;
    impl crate::KvIndex for EmptyIdx {}

    let index = EmptyIdx;
    let no_bytes: &[u8] = &[];

    // The per-matrix size does not matter on this path; the point is that
    // nothing gets sliced out of the empty mmap (i.e. the call does not panic).
    let (gate, up, down) =
        resolve_ffn_weights(&index, 7, no_bytes, 1_115_136, QuantFormat::Q4_K);

    // Every stub carries no data...
    for stub in [&gate, &up, &down] {
        assert!(stub.data.is_empty());
    }
    // ...and still reports the format that was asked for.
    for stub in [&gate, &up, &down] {
        assert_eq!(stub.format, QuantFormat::Q4_K);
    }
}
}
Loading