Skip to content

Commit 8d4b379

Browse files
committed
fix: chunked SDPA correctness — scale, causal offset, precision, overflow
Fix four bugs in sdpa_full_self_attention_2pass and redesign the reduction pipeline for numerical precision and memory scalability. Bugs fixed: 1. Double M_LOG2E: host pre-multiplied scale by M_LOG2E, but the kernel already multiplies by M_LOG2E_F. Result was scale^2 * M_LOG2E^2 instead of scale * M_LOG2E. Fix: pass raw scale (matches single-pass). 2. Wrong causal qL_off: formula was (k_start + chunk_kL) - qL = k_end - qL. Correct formula is (kL - qL) - k_start. The old formula broke causal masking completely — early chunks masked everything, late chunks nothing. 3. int32 overflow: B*H*qL*D overflows signed int32 at H=64, qL=131072, D=256 (exactly 2^31). Fix: use int64_t for bhq/bhqd. 4. simdgroup_barrier: guard was BD == 128, but BD=256 (head_dim=256 models) needs the same V@P read-after-write barrier. Fix: BD >= 128. Redesign: streaming merge with float32 accumulator - Old: allocate [n_chunks, B, H, qL, D] partials in input dtype, then reduce all chunks at once. Memory scales linearly with chunk count, precision limited by half/bfloat16 round-trip. - New: one chunk buffer (type T, reused) + one float32 accumulator. After each chunk's steel_attention dispatch, sdpa_full_merge folds results into the accumulator via online softmax. sdpa_full_finalize normalizes and writes output with correct stride layout (BLHD). - Memory: O(B*H*qL*D) constant regardless of chunk count. - Precision: float32 throughout accumulation, only final output cast to T. Verified against manual float32 reference (matmul + softmax): - Non-causal kL=65537: max_diff=0.000199, mean_diff=0.0000098 - Causal kL=65537: max_diff=0.008, mean_diff=0.0000193 - GQA D=256 causal: finite, correct magnitude - 3-chunk kL=131073: finite, correct magnitude
1 parent 4c39174 commit 8d4b379

3 files changed

Lines changed: 229 additions & 129 deletions

File tree

mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -431,7 +431,7 @@ template <
431431
for (short id = 0; id < TD; id++) {
432432
STEEL_PRAGMA_UNROLL
433433
for (short ik = 0; ik < TK; ik++) {
434-
if constexpr (BD == 128) {
434+
if constexpr (BD >= 128) {
435435
simdgroup_barrier(mem_flags::mem_none);
436436
}
437437

@@ -441,7 +441,7 @@ template <
441441
Vtile.template load<T, 1, 1, LDV_tgp, 1>(
442442
&Vs[Vs_offset + kk * LDV_tgp + dd]);
443443

444-
if constexpr (BD == 128) {
444+
if constexpr (BD >= 128) {
445445
simdgroup_barrier(mem_flags::mem_none);
446446
}
447447

Lines changed: 143 additions & 62 deletions
Original file line numberDiff line numberDiff line change
// Copyright © 2026 Apple Inc. (contributed by Thump604)
//
// Merge and finalize kernels for 2-pass chunked full attention.
//
// Streaming online softmax: after each chunk's steel_attention dispatch,
// sdpa_full_merge folds the chunk's partial results into a float32
// accumulator. After all chunks, sdpa_full_finalize normalizes and
// writes the output in the caller's stride layout.
//
// Float32 accumulation eliminates the precision loss that would occur
// from storing intermediate results in half/bfloat16.

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"

using namespace metal;

// ---------------------------------------------------------------------------
// sdpa_full_merge — fold one chunk into the running float32 accumulator
// ---------------------------------------------------------------------------
//
// Streaming online-softmax merge: steel_attention writes one chunk's partial
// output plus per-query-row softmax state; this kernel folds that chunk into
// a float32 accumulator so only one chunk buffer is ever live, regardless of
// how many chunks are processed.
//
// Buffers:
//   chunk_os   [B*H*qL*D] — chunk partial O, in the input dtype T
//   chunk_maxs [B*H*qL]   — per-row running max (log2 domain: the merge
//                           rescales with exp2, matching steel_attention)
//   chunk_sums [B*H*qL]   — per-row softmax denominator for this chunk
//   accum_os/maxs/sums    — float32 accumulator state, reused across chunks
//   is_first              — 1 → plain copy into accum; 0 → online merge
//
// Grid: (D, qL, B*H) — one thread per output element.
// Group: (D, 1, 1) — all D threads for a query row in one threadgroup.
// Requires D <= 256 (true for all current models).
//
// A threadgroup_barrier separates reads of accum_maxs from the write by
// thread 0, preventing a race between SIMD groups in the same threadgroup.

template <typename T>
[[kernel]] void sdpa_full_merge(
    const device T* chunk_os [[buffer(0)]],
    const device float* chunk_maxs [[buffer(1)]],
    const device float* chunk_sums [[buffer(2)]],
    device float* accum_os [[buffer(3)]],
    device float* accum_maxs [[buffer(4)]],
    device float* accum_sums [[buffer(5)]],
    constant int& is_first [[buffer(6)]],
    constant int& D [[buffer(7)]],
    constant int& qL [[buffer(8)]],
    uint3 tid [[thread_position_in_grid]],
    uint tid_in_tg [[thread_index_in_threadgroup]]) { // currently unused

  const int d_idx = tid.x;  // position within the head dimension
  const int q_idx = tid.y;  // query position
  const int bh_idx = tid.z; // fused batch*head index

  // Guard for ragged dispatch.
  // NOTE(review): threads returning here skip the threadgroup_barrier below;
  // that is only well-defined if a threadgroup never straddles this guard
  // (i.e. the dispatched grid x-extent is exactly D) — confirm against the
  // host-side dispatch.
  if (d_idx >= D || q_idx >= qL)
    return;

  // Row index into the [B*H*qL] state arrays, and element index into the
  // contiguous BHLD O buffers. 64-bit: bhq*D can exceed 2^31
  // (e.g. H=64, qL=131072, D=256 is exactly 2^31).
  const int bhq = bh_idx * qL + q_idx;
  const long bhqd = long(bhq) * D + d_idx;

  const float chunk_o = float(chunk_os[bhqd]);

  // First chunk: initialize the accumulator directly — nothing to merge.
  // is_first comes from a constant buffer, so the whole threadgroup takes
  // the same path and the early return is uniform.
  if (is_first) {
    accum_os[bhqd] = chunk_o;
    if (d_idx == 0) {
      // One thread per row owns the scalar per-row state.
      accum_maxs[bhq] = chunk_maxs[bhq];
      accum_sums[bhq] = chunk_sums[bhq];
    }
    return;
  }

  // --- Online softmax merge (chunks 1+) ---

  // Read shared per-row state before the barrier-protected write below.
  const float acc_max = accum_maxs[bhq];
  const float acc_sum = accum_sums[bhq];
  const float c_max = chunk_maxs[bhq];
  const float c_sum = chunk_sums[bhq];

  // Rescale both sides to the common (larger) max so every exponential is
  // <= 1 and cannot overflow.
  const float new_max = max(acc_max, c_max);
  const float scale_old = fast::exp2(acc_max - new_max);
  const float scale_new = fast::exp2(c_max - new_max);

  // Per-element update — each thread writes a unique bhqd, no conflicts.
  accum_os[bhqd] = accum_os[bhqd] * scale_old + chunk_o * scale_new;

  // Barrier: all threads must have read the old accum_maxs/sums above
  // before thread 0 overwrites them below.
  threadgroup_barrier(mem_flags::mem_device);

  if (d_idx == 0) {
    accum_maxs[bhq] = new_max;
    accum_sums[bhq] = acc_sum * scale_old + c_sum * scale_new;
  }
}
// ---------------------------------------------------------------------------
93+
// sdpa_full_finalize — normalize float32 accumulator → output in type T
94+
// ---------------------------------------------------------------------------
95+
//
96+
// Grid: (D, qL, B*H) — one thread per output element.
97+
// Group: (min(D, 256), 1, 1).
98+
//
99+
// Handles the layout transposition from contiguous BHLD accumulator
100+
// to the caller's output stride layout (typically BLHD).
101+
102+
template <typename T>
103+
[[kernel]] void sdpa_full_finalize(
104+
const device float* accum_os [[buffer(0)]],
105+
const device float* accum_sums [[buffer(1)]],
106+
device T* output [[buffer(2)]],
107+
constant int& D [[buffer(3)]],
108+
constant int& H [[buffer(4)]],
109+
constant int& qL [[buffer(5)]],
110+
constant int64_t* O_strides [[buffer(6)]],
111+
uint3 tid [[thread_position_in_grid]]) {
112+
113+
const int d_idx = tid.x;
114+
const int q_idx = tid.y;
115+
const int bh_idx = tid.z;
116+
117+
if (d_idx >= D || q_idx >= qL)
118+
return;
119+
120+
const int b = bh_idx / H;
121+
const int h = bh_idx % H;
122+
123+
// Contiguous BHLD index into accumulator
124+
const int bhq = bh_idx * qL + q_idx;
125+
const long bhqd = long(bhq) * D + d_idx;
69126

70-
output[bhqd] = T(running_o / running_sum);
127+
// Strided index into output (may be BLHD or other layout)
128+
const long out_idx = long(b) * O_strides[0] +
129+
long(h) * O_strides[1] +
130+
long(q_idx) * O_strides[2] +
131+
d_idx;
132+
133+
const float sum = accum_sums[bhq];
134+
output[out_idx] = T(accum_os[bhqd] / sum);
71135
}
72136

// ---------------------------------------------------------------------------
// Template instantiations
// ---------------------------------------------------------------------------
//
// host_name strings ("sdpa_full_merge_<tname>", "sdpa_full_finalize_<tname>")
// are the lookup keys used by the host-side dispatch — keep them in sync
// with the caller. dtype is the attention input/output type T; the
// accumulator buffers are always float32.

#define instantiate_merge(tname, dtype) \
  template [[host_name("sdpa_full_merge_" #tname)]] \
  [[kernel]] void sdpa_full_merge<dtype>( \
      const device dtype*, const device float*, const device float*, \
      device float*, device float*, device float*, \
      constant int&, constant int&, constant int&, \
      uint3, uint);

#define instantiate_finalize(tname, dtype) \
  template [[host_name("sdpa_full_finalize_" #tname)]] \
  [[kernel]] void sdpa_full_finalize<dtype>( \
      const device float*, const device float*, \
      device dtype*, \
      constant int&, constant int&, constant int&, constant int64_t*, \
      uint3);

instantiate_merge(float16, half);
instantiate_merge(bfloat16, bfloat16_t);
instantiate_merge(float32, float);

instantiate_finalize(float16, half);
instantiate_finalize(bfloat16, bfloat16_t);
instantiate_finalize(float32, float);
// clang-format on

0 commit comments

Comments
 (0)