Commit a2d6335

fix: add head_dim=256 to fused SDPA full attention kernel
sdpa_full_supported_head_dim only included {64, 80, 128}. Models with head_dim=256 (Qwen3.5 family) fell back to the unfused naive attention path, which materializes the full score matrix as a single matmul. At 32K+ context this creates 8+ GB single allocations that crash Metal's buffer allocator.

Add head_dim=256 to the dispatch gate and instantiate the steel_attention kernel with bd=256. The Metal kernel template handles arbitrary BD via a template parameter, so no kernel code changes are needed.

Verified: 32K, 64K, and 128K context on an M2 Ultra with Qwen3.5-122B-A10B.
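Rough arithmetic on the allocation size (illustrative numbers, assuming fp16 scores; the exact head count depends on the model config): one [L, L] score matrix at L = 32768 is 32768 * 32768 * 2 bytes = 2 GiB, so a single tensor holding scores for even four query heads already exceeds 8 GB, consistent with the failure described above.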
1 parent 46c181b commit a2d6335

3 files changed

Lines changed: 51 additions & 8 deletions

mlx/backend/metal/eval.cpp

Lines changed: 48 additions & 7 deletions
@@ -1,5 +1,7 @@
 // Copyright © 2023-2024 Apple Inc.
+#include <atomic>
 #include <memory>
+#include <mutex>
 
 #include "mlx/backend/gpu/eval.h"
 #include "mlx/backend/metal/device.h"
@@ -9,11 +11,35 @@
 
 namespace mlx::core::gpu {
 
-void init() {}
+// Storage for command buffer errors from completion handlers.
+// Completion handlers run on GCD dispatch queues where C++ exceptions
+// cannot propagate — throwing from a handler calls std::terminate.
+// Instead, store the error and rethrow at the next synchronization point.
+static std::mutex error_mutex_;
+static std::string error_message_;
+static std::atomic<bool> has_error_{false};
 
-void new_stream(Stream stream) {
-  if (stream.device == mlx::core::Device::gpu) {
-    metal::device(stream.device).get_command_encoder(stream.index);
+static void store_error(MTL::CommandBuffer* cbuf) {
+  if (cbuf->status() == MTL::CommandBufferStatusError) {
+    std::lock_guard<std::mutex> lock(error_mutex_);
+    if (!has_error_.load()) {
+      std::ostringstream msg;
+      msg << "[METAL] Command buffer execution failed: "
+          << cbuf->error()->localizedDescription()->utf8String();
+      error_message_ = msg.str();
+      has_error_.store(true);
+    }
+  }
+}
+
+static void check_stored_error() {
+  if (has_error_.load()) {
+    std::lock_guard<std::mutex> lock(error_mutex_);
+    if (has_error_.load()) {
+      std::string msg = std::move(error_message_);
+      has_error_.store(false);
+      throw std::runtime_error(msg);
+    }
   }
 }
 
@@ -26,7 +52,18 @@ inline void check_error(MTL::CommandBuffer* cbuf) {
   }
 }
 
+void init() {}
+
+void new_stream(Stream stream) {
+  if (stream.device == mlx::core::Device::gpu) {
+    metal::device(stream.device).get_command_encoder(stream.index);
+  }
+}
+
 void eval(array& arr) {
+  // Check for errors from previous async command buffers
+  check_stored_error();
+
   auto pool = metal::new_scoped_memory_pool();
   auto s = arr.primitive().stream();
   auto& d = metal::device(s.device);
@@ -62,13 +99,13 @@ void eval(array& arr) {
     command_buffer->addCompletedHandler(
         [s, buffers = std::move(buffers)](MTL::CommandBuffer* cbuf) {
           scheduler::notify_task_completion(s);
-          check_error(cbuf);
+          store_error(cbuf);
         });
     d.commit_command_buffer(s.index);
   } else {
     command_buffer->addCompletedHandler(
         [buffers = std::move(buffers)](MTL::CommandBuffer* cbuf) {
-          check_error(cbuf);
+          store_error(cbuf);
        });
  }
 }
@@ -78,7 +115,8 @@ void finalize(Stream s) {
   auto& d = metal::device(s.device);
   auto cb = d.get_command_buffer(s.index);
   d.end_encoding(s.index);
-  cb->addCompletedHandler([](MTL::CommandBuffer* cbuf) { check_error(cbuf); });
+  cb->addCompletedHandler(
+      [](MTL::CommandBuffer* cbuf) { store_error(cbuf); });
   d.commit_command_buffer(s.index);
 }
 
@@ -90,7 +128,10 @@ void synchronize(Stream s) {
   d.end_encoding(s.index);
   d.commit_command_buffer(s.index);
   cb->waitUntilCompleted();
+  // Check directly — we're on the calling thread, can throw safely
   check_error(cb);
+  // Also check any stored errors from async handlers
+  check_stored_error();
   cb->release();
 }
 
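The double check of has_error_ in check_stored_error is a lock-free fast path followed by a re-check under the mutex, so two racing callers cannot both consume the same stored message. Below is a distilled, compilable sketch of the same store-and-rethrow pattern, with a plain std::thread standing in for the Metal completion handler; the driver code and simplified store_error signature are illustrative, not part of MLX:

// Standalone rendering of the error-deferral pattern the diff above adds to
// eval.cpp: a callback that must not throw records the failure, and the next
// synchronization point on an ordinary calling thread rethrows it.
#include <atomic>
#include <mutex>
#include <stdexcept>
#include <string>
#include <thread>

static std::mutex error_mutex;
static std::string error_message;
static std::atomic<bool> has_error{false};

// Called from a context where throwing would call std::terminate
// (in the diff: the GCD completion handler).
void store_error(const std::string& what) {
  std::lock_guard<std::mutex> lock(error_mutex);
  if (!has_error.load()) {
    error_message = what;
    has_error.store(true);
  }
}

// Called at a synchronization point on a normal thread; safe to throw here.
void check_stored_error() {
  if (has_error.load()) {  // lock-free fast path for the common no-error case
    std::lock_guard<std::mutex> lock(error_mutex);
    if (has_error.load()) {  // re-check: another thread may have consumed it
      std::string msg = std::move(error_message);
      has_error.store(false);
      throw std::runtime_error(msg);
    }
  }
}

int main() {
  // Simulate an async completion handler reporting a failure.
  std::thread worker([] {
    store_error("[METAL] Command buffer execution failed: example");
  });
  worker.join();
  try {
    check_stored_error();  // surfaces as a normal C++ exception on this thread
  } catch (const std::runtime_error&) {
    return 0;  // caller sees an exception instead of std::terminate
  }
  return 1;
}
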
mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.metal

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@
       attention, dtype, bq, bk, bd, wm, wn, mtype, float)
 
 #define instantiate_attn_shapes_helper(iname, itype, mname, mtype) \
+  instantiate_attn(iname, itype, 32, 16, 256, 4, 1, mname, mtype)  \
   instantiate_attn(iname, itype, 32, 16, 128, 4, 1, mname, mtype)  \
   instantiate_attn(iname, itype, 32, 32, 80, 4, 1, mname, mtype)   \
   instantiate_attn(iname, itype, 32, 32, 64, 4, 1, mname, mtype)
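
For orientation, reading the instantiate_attn signature in the context line above: bq and bk are presumably the query and key tile sizes, bd is the head dimension, and wm/wn the simdgroup tiling. The new bd=256 entry reuses the 32/16/4/1 configuration already used for bd=128 rather than the 32/32 tiles used for the smaller head dims.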

mlx/backend/metal/scaled_dot_product_attention.cpp

Lines changed: 2 additions & 1 deletion
@@ -620,7 +620,8 @@ bool ScaledDotProductAttention::use_fallback(
       (query_head_dim == 64 || query_head_dim == 96 || query_head_dim == 128 ||
        query_head_dim == 256);
   const bool sdpa_full_supported_head_dim = query_head_dim == value_head_dim &&
-      (query_head_dim == 64 || query_head_dim == 80 || query_head_dim == 128);
+      (query_head_dim == 64 || query_head_dim == 80 || query_head_dim == 128 ||
+       query_head_dim == 256);
 
   const bool sdpa_full_supported_mask = !has_mask || has_arr_mask ||
       (query_sequence_length <= key_sequence_length && do_causal);
