Skip to content

Commit 4c39174

Browse files
committed
feat: chunked full-attention SDPA to avoid GPU watchdog timeout
The steel_attention kernel processes all keys in a single Metal dispatch. At 65K+ key sequence length, the dispatch can exceed the macOS GPU watchdog threshold (~5s), causing kIOGPUCommandBufferCallbackErrorImpactingInteractivity and process termination. Add 2-pass chunked full attention (sdpa_full_self_attention_2pass): - Splits the key sequence into 65K-token chunks - Each chunk dispatches the existing steel_attention kernel with write_partial=true, outputting unnormalized O + partial max/sum - Reduction kernel (sdpa_full_reduce) merges chunk results via online softmax: O_final = sum(O_chunk * exp2(max_chunk - max_all)) / sum(sum_chunk * exp2(max_chunk - max_all)) - Routes automatically when kL >= 65536 Verified: Qwen3.5-2B at 512K context completes successfully (was crashing with GPU watchdog before this fix). TTFT=887s, 36.7 tok/s. The existing 1-pass path is unchanged for kL < 65536. The vector SDPA (qL <= 8, decode) already had 2-pass chunking. Files: - params.h: add chunked attention fields to AttnParams - steel_attention.h: add write_partial function constant for partial output mode (unnormalized O + max/sum to global memory) - steel_attention_reduce.metal: new reduction kernel for merging chunk results via online softmax - scaled_dot_product_attention.cpp: add sdpa_full_self_attention_2pass host function, route kL >= 65536 to it - CMakeLists.txt: build reduction kernel
1 parent 1432557 commit 4c39174

5 files changed

Lines changed: 345 additions & 13 deletions

File tree

mlx/backend/metal/kernels/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ if(NOT MLX_METAL_JIT)
153153
build_kernel(steel/gemm/kernels/steel_gemm_segmented ${STEEL_HEADERS})
154154
build_kernel(gemv_masked steel/utils.h)
155155
build_kernel(steel/attn/kernels/steel_attention ${STEEL_ATTN_HEADERS})
156+
build_kernel(steel/attn/kernels/steel_attention_reduce)
156157

157158
if((MLX_METAL_VERSION GREATER_EQUAL 400) AND (MACOS_SDK_VERSION GREATER_EQUAL
158159
26.2))

mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h

Lines changed: 47 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ constant bool align_K [[function_constant(201)]];
1414
constant bool has_mask [[function_constant(300)]];
1515
constant bool do_causal [[function_constant(301)]];
1616
constant bool has_sinks [[function_constant(302)]];
17+
constant bool write_partial [[function_constant(303)]];
1718

1819
struct MaxOp {
1920
template <typename T>
@@ -76,6 +77,8 @@ template <
7677
const constant AttnMaskParams* mask_params [[buffer(5), function_constant(has_mask)]],
7778
const device MaskType* mask [[buffer(6), function_constant(has_mask)]],
7879
const device T* sinks [[buffer(7), function_constant(has_sinks)]],
80+
device float* partial_maxs [[buffer(8), function_constant(write_partial)]],
81+
device float* partial_sums [[buffer(9), function_constant(write_partial)]],
7982
uint simd_lane_id [[thread_index_in_simdgroup]],
8083
uint simd_group_id [[simdgroup_index_in_threadgroup]],
8184
uint3 tid [[threadgroup_position_in_grid]],
@@ -456,21 +459,54 @@ template <
456459
loader_v.next();
457460
}
458461

459-
// Normalize output
460-
Otile.template row_bin_op<DivOp>(sum_score);
461-
threadgroup_barrier(mem_flags::mem_none);
462+
if (write_partial) {
463+
// Write unnormalized O, max_score, sum_score for 2-pass reduction.
464+
// O is NOT divided by sum — the reduction kernel handles normalization.
465+
threadgroup_barrier(mem_flags::mem_none);
462466

463-
// Store results
464-
O += (tm + sm) * params->O_strides[2] + sn;
467+
O += (tm + sm) * params->O_strides[2] + sn;
465468

466-
if (!align_Q && int(tid.x) == (params->NQ_aligned)) {
467-
auto dst_tile_dims = short2(BD - sn, params->qL_rem - (tm + sm));
469+
if (!align_Q && int(tid.x) == (params->NQ_aligned)) {
470+
auto dst_tile_dims = short2(BD - sn, params->qL_rem - (tm + sm));
471+
if (dst_tile_dims.x > 0 && dst_tile_dims.y > 0) {
472+
Otile.template store_safe<T, 1, 1>(O, params->O_strides[2], dst_tile_dims);
473+
}
474+
} else {
475+
Otile.template store<T, 1, 1>(O, params->O_strides[2]);
476+
}
468477

469-
if (dst_tile_dims.x <= 0 || dst_tile_dims.y <= 0)
470-
return;
478+
// Write per-row max and sum to global memory.
479+
// Layout: [B, H, qL] — one value per query position.
480+
// Each thread writes its owned rows (determined by simd position).
481+
int base_row = int(tid.x) * BQ + tm + sm;
482+
int global_idx = int(tidl.z) * params->H * params->qL +
483+
int(tidl.y) * params->qL + base_row;
471484

472-
Otile.template store_safe<T, 1, 1>(O, params->O_strides[2], dst_tile_dims);
485+
STEEL_PRAGMA_UNROLL
486+
for (short i = 0; i < kRowsPT; ++i) {
487+
int row = base_row + i * kFragSize;
488+
if (row < params->qL) {
489+
int idx = global_idx + i * kFragSize;
490+
partial_maxs[idx] = max_score[i];
491+
partial_sums[idx] = sum_score[i];
492+
}
493+
}
473494
} else {
474-
Otile.template store<T, 1, 1>(O, params->O_strides[2]);
495+
// Normal path: normalize and write final output.
496+
Otile.template row_bin_op<DivOp>(sum_score);
497+
threadgroup_barrier(mem_flags::mem_none);
498+
499+
O += (tm + sm) * params->O_strides[2] + sn;
500+
501+
if (!align_Q && int(tid.x) == (params->NQ_aligned)) {
502+
auto dst_tile_dims = short2(BD - sn, params->qL_rem - (tm + sm));
503+
504+
if (dst_tile_dims.x <= 0 || dst_tile_dims.y <= 0)
505+
return;
506+
507+
Otile.template store_safe<T, 1, 1>(O, params->O_strides[2], dst_tile_dims);
508+
} else {
509+
Otile.template store<T, 1, 1>(O, params->O_strides[2]);
510+
}
475511
}
476512
}
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
// Copyright © 2026 Apple Inc. (contributed by Thump604)
2+
//
3+
// Reduction kernel for 2-pass chunked full attention.
4+
// Merges partial (O, max, sum) from multiple key-range chunks
5+
// using the online softmax algorithm.
6+
7+
// clang-format off
8+
#include "mlx/backend/metal/kernels/utils.h"
9+
10+
using namespace metal;
11+
12+
// Merge partial attention results using online softmax.
13+
// For each query position:
14+
// new_max = max(max_a, max_b)
15+
// scale_a = exp2(max_a - new_max)
16+
// scale_b = exp2(max_b - new_max)
17+
// O = O_a * scale_a + O_b * scale_b
18+
// sum = sum_a * scale_a + sum_b * scale_b
19+
// After all chunks merged: O_final = O / sum
20+
21+
// Merge per-chunk partial attention results for a single output element.
//
// Each thread owns one (batch*head, query, feature) element. It walks the
// n_chunks partial results, folding (O, max, sum) together with the online
// softmax recurrence:
//
//   new_max = max(max_a, max_b)
//   O   = O_a   * exp2(max_a - new_max) + O_b   * exp2(max_b - new_max)
//   sum = sum_a * exp2(max_a - new_max) + sum_b * exp2(max_b - new_max)
//
// and finally writes the normalized output O_final = O / sum.
//
// All flat offsets are computed in 64-bit (`long`): the per-chunk stride is
// B*H*qL*D elements, which can exceed INT_MAX at the long-context sizes this
// kernel exists for (e.g. H=16, qL=512K, D=128 gives a 2^30-element stride,
// so `c * stride` overflows 32-bit int for c >= 2).
template <typename T>
[[kernel]] void sdpa_full_reduce(
    const device T* partials [[buffer(0)]], // [n_chunks, B*H*qL*D]
    const device float* maxs [[buffer(1)]], // [n_chunks, B*H*qL]
    const device float* sums [[buffer(2)]], // [n_chunks, B*H*qL]
    device T* output [[buffer(3)]], // [B*H*qL*D]
    constant int& n_chunks [[buffer(4)]],
    constant int& D [[buffer(5)]],
    constant int& qL [[buffer(6)]],
    uint3 tid [[thread_position_in_grid]], // (d, q, bh)
    uint3 grid [[threads_per_grid]]) {
  int d_idx = int(tid.x);
  int q_idx = int(tid.y);
  int bh_idx = int(tid.z);

  // Guard the ragged edge when the dispatch grid is rounded up.
  if (d_idx >= D || q_idx >= qL)
    return;

  // Flat offsets of this thread's row / element within one chunk.
  long bhq = long(bh_idx) * qL + q_idx;
  long bhqd = bhq * D + d_idx;

  float running_max = -INFINITY;
  float running_sum = 0.0f;
  float running_o = 0.0f;

  // Element distance between consecutive chunks in the packed intermediate
  // buffers. grid.z == B * H, so one chunk spans B*H*qL rows.
  long chunk_stride_bhq = long(grid.z) * qL;
  long chunk_stride_bhqd = chunk_stride_bhq * D;

  for (int c = 0; c < n_chunks; c++) {
    float chunk_max = maxs[c * chunk_stride_bhq + bhq];
    float chunk_sum = sums[c * chunk_stride_bhq + bhq];
    float chunk_o = float(partials[c * chunk_stride_bhqd + bhqd]);

    if (c == 0) {
      // First chunk seeds the running state directly; this avoids
      // exp2(-INF - (-INF)) = NaN if a chunk's row max is -INF.
      running_max = chunk_max;
      running_sum = chunk_sum;
      running_o = chunk_o;
    } else {
      // Online softmax merge: rescale both sides to the new shared max.
      float new_max = max(running_max, chunk_max);
      float scale_old = fast::exp2(running_max - new_max);
      float scale_new = fast::exp2(chunk_max - new_max);

      running_o = running_o * scale_old + chunk_o * scale_new;
      running_sum = running_sum * scale_old + chunk_sum * scale_new;
      running_max = new_max;
    }
  }

  // NOTE(review): a fully-masked row would give running_sum == 0 and emit
  // NaN/Inf here, matching the 1-pass kernel's divide — confirm upstream
  // masking never produces an all-masked query row on this path.
  output[bhqd] = T(running_o / running_sum);
}
72+
73+
// Instantiate a host-visible specialization of sdpa_full_reduce for one
// element type. The host side looks kernels up by the exact string
// "sdpa_full_reduce_<tname>", so these host_name strings are part of the
// host/device contract — keep them in sync with the dispatch code.
#define instantiate_reduce(tname, dtype) \
  template [[host_name("sdpa_full_reduce_" #tname)]] \
  [[kernel]] void sdpa_full_reduce<dtype>( \
      const device dtype*, const device float*, const device float*, \
      device dtype*, constant int&, constant int&, constant int&, \
      uint3, uint3);

// One specialization per supported attention element type; the per-chunk
// max/sum intermediates stay float regardless of the element type.
instantiate_reduce(float16, half);
instantiate_reduce(bfloat16, bfloat16_t);
instantiate_reduce(float32, float);
// clang-format on

mlx/backend/metal/kernels/steel/attn/params.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,14 @@ struct AttnParams {
3434
int64_t K_strides[3]; ///< Key strides (B, H, L, D = 1)
3535
int64_t V_strides[3]; ///< Value strides (B, H, L, D = 1)
3636
int64_t O_strides[3]; ///< Output strides (B, H, L, D = 1)
37+
38+
// Chunked attention parameters (for 2-pass to avoid GPU watchdog).
39+
// When nk_chunk_end > 0, the kernel processes only keys in
40+
// [nk_chunk_start, nk_chunk_end) and writes partial softmax state
41+
// to intermediate buffers. The reduction pass merges chunk results.
42+
int nk_chunk_start; ///< First key block to process (0 = from beginning)
43+
int nk_chunk_end; ///< Last key block (exclusive, 0 = use NK)
44+
int chunk_idx; ///< Index of this chunk (for indexing intermediates)
3745
};
3846

3947
struct AttnMaskParams {

0 commit comments

Comments
 (0)