add causal_upper_left mask option to scaled_dot_product_attention

mm65x · mm65x · commit 6b2d5879a468 · 2026-03-14T23:07:30.000Z
diff --git a/mlx/backend/metal/kernels/sdpa_vector.h b/mlx/backend/metal/kernels/sdpa_vector.h
@@ -36,6 +36,7 @@ template <typename T, int D, int V = D>
     const device T* sinks [[buffer(16), function_constant(has_sinks)]],
     const constant int& num_q_heads
     [[buffer(17), function_constant(has_sinks)]],
+    const constant int& causal_offset [[buffer(18)]],
     uint3 tid [[threadgroup_position_in_grid]],
     uint3 tpg [[threadgroups_per_grid]],
     uint simd_gid [[simdgroup_index_in_threadgroup]],
@@ -99,7 +100,7 @@ template <typename T, int D, int V = D>
   for (int i = simd_gid; i < N; i += BN) {
     bool use_key = true;
     if (do_causal) {
-      use_key = i <= (N - int(tpg.y) + int(q_seq_idx));
+      use_key = i <= (causal_offset + int(q_seq_idx));
     } else if (bool_mask) {
       use_key = bmask[0];
     } else if (float_mask) {
@@ -199,6 +200,7 @@ template <typename T, int D, int V = D>
     const constant int& mask_head_stride
     [[buffer(17), function_constant(has_mask)]],
     const device T* sinks [[buffer(18), function_constant(has_sinks)]],
+    const constant int& causal_offset [[buffer(19)]],
     uint3 tptg [[threads_per_threadgroup]],
     uint3 tidtg [[thread_position_in_threadgroup]],
     uint3 tid [[threadgroup_position_in_grid]],
@@ -263,7 +265,7 @@ template <typename T, int D, int V = D>
   for (int i = block_idx; i < N; i += blocks) {
     bool use_key = true;
     if (do_causal) {
-      use_key = i <= (N - q_seq_len + int(q_seq_idx));
+      use_key = i <= (causal_offset + int(q_seq_idx));
     } else if (bool_mask) {
       use_key = bmask[0];
     } else if (float_mask) {
diff --git a/mlx/backend/metal/scaled_dot_product_attention.cpp b/mlx/backend/metal/scaled_dot_product_attention.cpp
@@ -24,6 +24,7 @@ void sdpa_full_self_attention_nax(
     const float scale,
     array& o,
     bool do_causal_,
+    bool causal_upper_left,
     const std::optional<array>& mask,
     const std::optional<array>& sinks) {
   using namespace mlx::steel;
@@ -131,7 +132,7 @@ void sdpa_full_self_attention_nax(
 
       /* int qL_rem = */ (qL - NQ_aligned * bq),
       /* int kL_rem = */ (kL - NK_aligned * bk),
-      /* int qL_off = */ (kL - qL),
+      /* int qL_off = */ (causal_upper_left ? 0 : kL - qL),
 
       /* int64_t Q_strides[3] = */ {q.strides(0), q.strides(1), q.strides(2)},
       /* int64_t K_strides[3] = */ {k.strides(0), k.strides(1), k.strides(2)},
@@ -172,6 +173,7 @@ void sdpa_full_self_attention_metal(
     const float scale,
     array& o,
     bool do_causal_,
+    bool causal_upper_left,
     const std::optional<array>& mask,
     const std::optional<array>& sinks) {
   if (metal::is_nax_available() && q.shape(3) != 80 &&
@@ -185,6 +187,7 @@ void sdpa_full_self_attention_metal(
         /* const float scale = */ scale,
         /* array& o = */ o,
         /* bool do_causal_ = */ do_causal_,
+        /* bool causal_upper_left = */ causal_upper_left,
         /* const std::optional<array>& mask = */ mask,
         /* const std::optional<array>& sinks = */ sinks);
   }
@@ -294,7 +297,7 @@ void sdpa_full_self_attention_metal(
 
       /* int qL_rem = */ (qL - NQ_aligned * bq),
       /* int kL_rem = */ (kL - NK_aligned * bk),
-      /* int qL_off = */ (kL - qL),
+      /* int qL_off = */ (causal_upper_left ? 0 : kL - qL),
 
       /* int64_t Q_strides[3] = */ {q.strides(0), q.strides(1), q.strides(2)},
       /* int64_t K_strides[3] = */ {k.strides(0), k.strides(1), k.strides(2)},
@@ -335,6 +338,7 @@ void sdpa_vector(
     array& out,
     float scale,
     bool do_causal,
+    bool causal_upper_left,
     const std::optional<array>& mask,
     const std::optional<array>& sinks) {
   // Set the kernel name
@@ -410,6 +414,8 @@ void sdpa_vector(
     compute_encoder.set_input_array(*sinks, 16);
     compute_encoder.set_bytes(q.shape(1), 17);
   }
+  int32_t causal_offset = causal_upper_left ? 0 : N - q.shape(2);
+  compute_encoder.set_bytes(causal_offset, 18);
 
   // Launch
   compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
@@ -424,6 +430,7 @@ void sdpa_vector_2pass(
     array& out,
     float scale,
     bool do_causal,
+    bool causal_upper_left,
     const std::optional<array>& mask,
     const std::optional<array>& sinks) {
   // Set the kernel name
@@ -554,6 +561,8 @@ void sdpa_vector_2pass(
   if (has_sinks) {
     compute_encoder.set_input_array(*sinks, 18);
   }
+  int32_t causal_offset = causal_upper_left ? 0 : N - q.shape(2);
+  compute_encoder.set_bytes(causal_offset, 19);
 
   // Launch
   compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
@@ -744,9 +753,11 @@ void ScaledDotProductAttention::eval_gpu(
     char devc = d.get_architecture().back();
     if (((devc == 'd' || devc == 's') && k.shape(2) >= 1024) ||
         (k.shape(1) < q.shape(1) && k.shape(2) >= 4096)) {
-      sdpa_vector_2pass(s, d, q, k, v, o, scale_, do_causal, mask, sinks);
+      sdpa_vector_2pass(
+          s, d, q, k, v, o, scale_, do_causal, causal_upper_left_, mask, sinks);
     } else {
-      sdpa_vector(s, d, q, k, v, o, scale_, do_causal, mask, sinks);
+      sdpa_vector(
+          s, d, q, k, v, o, scale_, do_causal, causal_upper_left_, mask, sinks);
     }
   }
 
@@ -779,7 +790,7 @@ void ScaledDotProductAttention::eval_gpu(
         : std::nullopt;
 
     sdpa_full_self_attention_metal(
-        s, d, q, k, v, scale_, o, do_causal_, mask, sinks);
+        s, d, q, k, v, scale_, o, do_causal_, causal_upper_left_, mask, sinks);
   }
 
   d.add_temporaries(std::move(copies), s.index);
diff --git a/mlx/fast.cpp b/mlx/fast.cpp
@@ -628,26 +628,31 @@ array scaled_dot_product_attention(
     }
   }
   // Check valid mask
-  if (mask_mode != "" && mask_mode != "causal" && mask_mode != "array") {
+  bool is_causal_mode = mask_mode == "causal" ||
+      mask_mode == "causal_lower_right" || mask_mode == "causal_upper_left";
+  if (mask_mode != "" && !is_causal_mode && mask_mode != "array") {
     std::ostringstream msg;
-    msg << "[scaled_dot_product_attention] Invalid mask_mode " << mask_mode
-        << ". mask_mode must be 'causal', 'array' or ''.";
+    msg << "[scaled_dot_product_attention] Invalid mask_mode '" << mask_mode
+        << "'. Must be 'causal', 'causal_lower_right', "
+        << "'causal_upper_left', 'array' or ''.";
     throw std::invalid_argument(msg.str());
   }
 
   bool do_causal = false;
+  bool causal_upper_left = false;
   bool has_mask = false;
   bool has_arr_mask = false;
   bool has_bool_mask = false;
 
-  if (mask_mode == "causal") {
+  if (is_causal_mode) {
     has_mask = true;
     do_causal = true;
+    causal_upper_left = (mask_mode == "causal_upper_left");
 
     if (mask_arr) {
       std::ostringstream msg;
       msg << "[scaled_dot_product_attention] Invalid mask_arr for mask_mode "
-          << "'casusal'. No array mask should be passed.";
+          << "'" << mask_mode << "'. No array mask should be passed.";
       throw std::invalid_argument(msg.str());
     }
   } else if (mask_arr) {
@@ -718,6 +723,7 @@ array scaled_dot_product_attention(
                    n_q_heads,
                    n_kv_heads,
                    do_causal,
+                   causal_upper_left,
                    has_sinks,
                    has_arr_mask,
                    s](const std::vector<array>& inputs) {
@@ -737,7 +743,7 @@ array scaled_dot_product_attention(
         if (do_causal) {
           int kL = k.shape(-2);
           int qL = q.shape(-2);
-          int offset = kL - qL;
+          int offset = causal_upper_left ? 0 : kL - qL;
           auto q_idx = arange(offset, qL + offset, s);
           auto k_idx = arange(0, kL, s);
           q_idx = expand_dims(q_idx, 1, s);
@@ -846,7 +852,13 @@ array scaled_dot_product_attention(
     }
     Shape out_shape{q.shape(0), q.shape(1), q.shape(2), v.shape(-1)};
     auto primitive = std::make_shared<ScaledDotProductAttention>(
-        stream, fallback, scale, do_causal, has_sinks, output_logsumexp);
+        stream,
+        fallback,
+        scale,
+        do_causal,
+        causal_upper_left,
+        has_sinks,
+        output_logsumexp);
     if (output_logsumexp) {
       return array::make_arrays(
           {std::move(out_shape), Shape{q.shape(0), q.shape(1), q.shape(2), 1}},
@@ -911,6 +923,7 @@ bool ScaledDotProductAttention::is_equivalent(const Primitive& other) const {
   const ScaledDotProductAttention& a_other =
       static_cast<const ScaledDotProductAttention&>(other);
   return scale_ == a_other.scale_ && do_causal_ == a_other.do_causal_ &&
+      causal_upper_left_ == a_other.causal_upper_left_ &&
       has_sinks_ == a_other.has_sinks_ &&
       output_logsumexp_ == a_other.output_logsumexp_;
 }
diff --git a/mlx/fast_primitives.h b/mlx/fast_primitives.h
@@ -210,11 +210,13 @@ class ScaledDotProductAttention : public Custom {
       std::function<std::vector<array>(std::vector<array>)> fallback,
       float scale,
       bool do_causal,
+      bool causal_upper_left,
       bool has_sinks,
       bool output_logsumexp)
       : Custom(stream, std::move(fallback)),
         scale_(scale),
         do_causal_(do_causal),
+        causal_upper_left_(causal_upper_left),
         has_sinks_(has_sinks),
         output_logsumexp_(output_logsumexp) {}
 
@@ -250,12 +252,18 @@ class ScaledDotProductAttention : public Custom {
   DEFINE_INPUT_OUTPUT_SHAPE()
   auto state() const {
     return std::make_tuple(
-        nullptr, scale_, do_causal_, has_sinks_, output_logsumexp_);
+        nullptr,
+        scale_,
+        do_causal_,
+        causal_upper_left_,
+        has_sinks_,
+        output_logsumexp_);
   }
 
  private:
   float scale_;
   bool do_causal_;
+  bool causal_upper_left_;
   bool has_sinks_;
   bool output_logsumexp_;
 };
diff --git a/python/src/fast.cpp b/python/src/fast.cpp
@@ -206,10 +206,13 @@ void init_fast(nb::module_& parent_module) {
         if (has_mask) {
           if (has_str_mask) {
             auto mask_str = std::get<std::string>(mask);
-            if (mask_str != "causal") {
+            if (mask_str != "causal" && mask_str != "causal_lower_right" &&
+                mask_str != "causal_upper_left") {
               std::ostringstream msg;
               msg << "[scaled_dot_product_attention] invalid mask option '"
-                  << mask_str << "'. Must be 'causal', or an array.";
+                  << mask_str
+                  << "'. Must be 'causal', 'causal_lower_right', "
+                  << "'causal_upper_left', or an array.";
               throw std::invalid_argument(msg.str());
             }
             return mx::fast::scaled_dot_product_attention(
@@ -267,13 +270,20 @@ void init_fast(nb::module_& parent_module) {
-            scale (float): Scale for queries (typically ``1.0 / sqrt(q.shape(-1)``).
+            scale (float): Scale for queries (typically ``1.0 / sqrt(q.shape(-1))``).
             mask (str or array, optional): The mask to apply to the
                query-key scores. The mask can be an array or a string indicating
-               the mask type. The only supported string type is ``"causal"``. If
-               the mask is an array it can be a boolean or additive mask. The mask
-               can have at most 4 dimensions and must be broadcast-compatible with
-               the shape ``[B, N, T_q, T_kv]``. If an additive mask is given its
-               type must promote to the promoted type of ``q``, ``k``, and ``v``.
-               The ``"causal"`` mask uses lower-right alignment where the
-               last query aligns with the last key.
+               the mask type. Supported string types are:
+
+               * ``"causal"`` or ``"causal_lower_right"``: Lower-right
+                 aligned causal mask. The last query attends to the last key.
+                 This is the standard mask for autoregressive decoding.
+               * ``"causal_upper_left"``: Upper-left aligned causal mask.
+                 Query ``i`` attends to keys ``0..i``. This matches PyTorch's
+                 default ``is_causal=True`` behavior.
+
+               If the mask is an array it can be a boolean or additive mask.
+               The mask can have at most 4 dimensions and must be
+               broadcast-compatible with the shape ``[B, N, T_q, T_kv]``. If
+               an additive mask is given its type must promote to the promoted
+               type of ``q``, ``k``, and ``v``.
             sinks (array, optional): An optional array of attention sinks.
                Default: ``None``.
 
diff --git a/python/tests/test_fast_sdpa.py b/python/tests/test_fast_sdpa.py
@@ -26,7 +26,6 @@ def mlx_ref_attn(q, k, v, scale=1.0, mask=None, sinks=None):
     scores = q @ mx.swapaxes(k, -1, -2)
     is_causal = mask == "causal"
     if mask is not None:
-
         if is_causal:
             offset = kL - L
             q_indices = mx.arange(L) + offset
@@ -642,6 +641,50 @@ def test_sdpa_sliced(self):
                         tolerance = {"rtol": 1e-2, "atol": 1e-2}
                     self.assertTrue(mx.allclose(ref, out, **tolerance))
 
+    def test_causal_mask_alignment(self):
+        B, H, D = 1, 2, 64
+        qL, kL = 4, 8
+        scale = 1.0 / math.sqrt(D)
+
+        mx.random.seed(0)
+        q = mx.random.normal((B, H, qL, D))
+        k = mx.random.normal((B, H, kL, D))
+        v = mx.random.normal((B, H, kL, D))
+
+        # "causal" is accepted as an alias of "causal_lower_right": same output
+        out_causal = mx.fast.scaled_dot_product_attention(
+            q, k, v, scale=scale, mask="causal"
+        )
+        out_lr = mx.fast.scaled_dot_product_attention(
+            q, k, v, scale=scale, mask="causal_lower_right"
+        )
+        self.assertTrue(mx.allclose(out_causal, out_lr, atol=1e-6, rtol=1e-5))
+
+        # "causal_upper_left" must equal an explicit mask: query i sees keys 0..i
+        q_idx = mx.arange(qL)
+        k_idx = mx.arange(kL)
+        ul_mask = q_idx[:, None] >= k_idx[None]
+        out_ul = mx.fast.scaled_dot_product_attention(
+            q, k, v, scale=scale, mask="causal_upper_left"
+        )
+        out_manual = mx.fast.scaled_dot_product_attention(
+            q, k, v, scale=scale, mask=ul_mask
+        )
+        self.assertTrue(mx.allclose(out_ul, out_manual, atol=1e-5, rtol=1e-4))
+
+        # with qL < kL the two alignments mask different keys, so outputs differ
+        self.assertFalse(mx.allclose(out_ul, out_lr, atol=1e-2, rtol=1e-2))
+
+        # with qL == kL the offset kL - qL is zero, so both alignments coincide
+        q_eq = mx.random.normal((B, H, kL, D))
+        out_lr_eq = mx.fast.scaled_dot_product_attention(
+            q_eq, k, v, scale=scale, mask="causal_lower_right"
+        )
+        out_ul_eq = mx.fast.scaled_dot_product_attention(
+            q_eq, k, v, scale=scale, mask="causal_upper_left"
+        )
+        self.assertTrue(mx.allclose(out_lr_eq, out_ul_eq, atol=1e-6, rtol=1e-5))
+
 
 if __name__ == "__main__":
     mlx_tests.MLXTestRunner(failfast=True)