Skip to content

Commit 478628d

Browse files
committed
fix(attention): add a not-supported assertion for inter-document masking with PyTorch SDPA
1 parent 382a952 commit 478628d

1 file changed

Lines changed: 3 additions & 0 deletions

File tree

src/modalities/models/gpt2/gpt2_model.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -825,6 +825,9 @@ def execute_attention(
             )  # (B, nh_q, T, hd)
             y = y.transpose(1, 2).contiguous()  # (B, T, nh_q, hd)
         elif attention_impl == AttentionImplementation.PYTORCH_FLASH:
+            assert (
+                attention_masking_information is None
+            ), "Inter-document masking is not supported for PyTorch Flash Attention."
             k, v = cls.repeat_kv_heads(q, k, v)  # for GQA (group query attention)
             y = torch.nn.functional.scaled_dot_product_attention(
                 query=q,
```

0 commit comments

Comments (0)