chrishayuk · chrishayuk · May 27, 2026 · May 27, 2026
diff --git a/crates/larql-cli/src/commands/primary/run_cmd.rs b/crates/larql-cli/src/commands/primary/run_cmd.rs
@@ -334,6 +334,7 @@ pub fn run(args: RunArgs) -> Result<(), Box<dyn std::error::Error>> {
             args.max_tokens,
             &args.moe_dispatch,
             args.moe_predispatch_iters,
+            args.metal,
         );
     }
 
@@ -463,6 +464,7 @@ fn run_with_moe_shards(
     max_tokens: usize,
     dispatch: &str,
     predispatch_iters: usize,
+    metal: bool,
 ) -> Result<(), Box<dyn std::error::Error>> {
     use larql_inference::ffn::moe_remote::{parse_unit_manifest, RemoteMoeBackend, ShardConfig};
     use larql_inference::{generate_with_remote_moe, generate_with_remote_moe_batch};
@@ -510,7 +512,22 @@ fn run_with_moe_shards(
 
     let num_shards = configs.len();
     // Initialise compute backend early so we can report it in the topology banner.
-    let backend = larql_compute::default_backend();
+    // Mirrors the `--metal` wiring in `run_with_remote_ffn` (PR #122): explicit
+    // opt-in via the CLI flag, with Metal-init failure falling back to CPU.
+    let backend: Box<dyn larql_compute::ComputeBackend> = if metal {
+        #[cfg(all(feature = "gpu", target_os = "macos"))]
+        {
+            larql_compute_metal::metal_backend()
+                .map(|m| Box::new(m) as Box<dyn larql_compute::ComputeBackend>)
+                .unwrap_or_else(larql_compute::cpu_backend)
+        }
+        #[cfg(not(all(feature = "gpu", target_os = "macos")))]
+        {
+            return Err("`--metal` requires the `gpu` feature on macOS".into());
+        }
+    } else {
+        larql_compute::cpu_backend()
+    };
     eprintln!("Connecting to {} MoE shard(s)…", num_shards);
     let remote = RemoteMoeBackend::connect(configs)
         .map_err(|e| format!("failed to connect to MoE shards: {e}"))?;

diff --git a/crates/larql-compute/src/pipeline_layer.rs b/crates/larql-compute/src/pipeline_layer.rs
@@ -417,6 +417,22 @@ pub fn resolve_ffn_weights<'a>(
         );
     }
 
+    // Pure MoE vindexes (e.g. Gemma-4 26B A4B) have no dense FFN tensor, so
+    // `interleaved_q4k.bin` is 0 bytes. The interleaved-kquant branch above is
+    // also not taken for these (no per-matrix manifest entries), so we fall
+    // through to here with an empty `q4_ffn_mmap`. Return empty `QuantWeight`
+    // stubs: `patch_pipeline_layers_for_remote_moe` (below) overwrites the
+    // per-layer dense weights for MoE layers, and `moe_fn` supersedes the dense
+    // FFN path during decode, so the empty slices are never read.
+    if q4_ffn_mmap.is_empty() {
+        let empty = QuantWeight {
+            data: &[],
+            scales: None,
+            format: ffn_format,
+        };
+        return (empty, empty, empty);
+    }
+
     let q4_ffn_per_layer = q4_ffn_per_matrix * 3;
     let fs = layer * q4_ffn_per_layer;
     (
@@ -768,4 +784,30 @@ mod tests {
             assert!(l.ffn_is_remote, "patch should set ffn_is_remote = true");
         }
     }
+
+    /// Regression test: `resolve_ffn_weights` must not slice an empty
+    /// `q4_ffn_mmap`. Pure-MoE vindexes (e.g. Gemma-4 26B A4B) ship a
+    /// zero-byte `interleaved_q4k.bin` because there is no dense FFN, and
+    /// without the `is_empty()` guard the first
+    /// `q4_ffn_mmap[fs..fs + q4_ffn_per_matrix]` slice would panic. The
+    /// guard returns empty `QuantWeight` stubs that keep the requested
+    /// format; they are expected never to be read, since the remote-MoE
+    /// patch and `moe_fn` bypass the dense FFN path during decode.
+    #[test]
+    fn resolve_ffn_weights_returns_empty_stubs_when_q4_ffn_mmap_is_empty() {
+        struct EmptyIdx;
+        impl crate::KvIndex for EmptyIdx {}
+        let idx = EmptyIdx;
+        let empty_mmap: &[u8] = &[];
+        // The numeric arguments are arbitrary on this path: the `is_empty()`
+        // guard returns before any offset into the (empty) mmap is computed.
+        let (gate, up, down) =
+            resolve_ffn_weights(&idx, 7, empty_mmap, 1_115_136, QuantFormat::Q4_K);
+        assert!(gate.data.is_empty());
+        assert!(up.data.is_empty());
+        assert!(down.data.is_empty());
+        assert_eq!(gate.format, QuantFormat::Q4_K);
+        assert_eq!(up.format, QuantFormat::Q4_K);
+        assert_eq!(down.format, QuantFormat::Q4_K);
+    }
 }