Use plgpu.TritonCompilerParams instead of nested dicts for compiler_params

ae-foster · ae-foster · commit 2b2e45b41b54 · 2025-05-27T11:05:29.000Z
diff --git a/folx/experimental/pallas/attention/custom_gradients.py b/folx/experimental/pallas/attention/custom_gradients.py
@@ -5,6 +5,7 @@
 import jax
 import jax.numpy as jnp
 from jax.experimental import pallas as pl
+from jax.experimental.pallas import gpu as plgpu
 
 from .mhsa import mhsa_kernel, reference_mhsa_kernel
 from .mhsea import mhsea_kernel, reference_mhsea_kernel
@@ -53,8 +54,8 @@ def mhsa_forward(
             out_shape=jax.ShapeDtypeStruct(
                 shape=(batch_len, seq_len, num_heads, head_len), dtype=q.dtype
             ),
-            compiler_params=dict(
-                triton=dict(num_warps=num_warps, num_stages=num_stages)
+            compiler_params=plgpu.TritonCompilerParams(
+                num_warps=num_warps, num_stages=num_stages
             ),
             debug=False,
             interpret=interpret,
@@ -113,8 +114,8 @@ def mhsa_backward(
                     shape=(batch_len, seq_len, num_heads, head_len), dtype=q.dtype
                 ),
             ],
-            compiler_params=dict(
-                triton=dict(num_warps=num_warps, num_stages=num_stages)
+            compiler_params=plgpu.TritonCompilerParams(
+                num_warps=num_warps, num_stages=num_stages
             ),
             debug=False,
             interpret=interpret,
@@ -268,8 +269,8 @@ def mhsea_forward(
                     shape=(batch_len, seq_len, num_heads), dtype=v.dtype
                 ),
             ],
-            compiler_params=dict(
-                triton=dict(num_warps=num_warps, num_stages=num_stages)
+            compiler_params=plgpu.TritonCompilerParams(
+                num_warps=num_warps, num_stages=num_stages
             ),
             debug=False,
             interpret=interpret,
@@ -372,8 +373,8 @@ def mhsea_backward(
                     shape=(batch_len, seq_len, num_heads, seq_len), dtype=e.dtype
                 ),
             ],
-            compiler_params=dict(
-                triton=dict(num_warps=num_warps, num_stages=num_stages)
+            compiler_params=plgpu.TritonCompilerParams(
+                num_warps=num_warps, num_stages=num_stages
             ),
             debug=False,
             interpret=interpret,
@@ -433,8 +434,8 @@ def mhsea_backward(
                     shape=(batch_len, seq_len, num_heads, head_len), dtype=v.dtype
                 ),
             ],
-            compiler_params=dict(
-                triton=dict(num_warps=num_warps, num_stages=num_stages)
+            compiler_params=plgpu.TritonCompilerParams(
+                num_warps=num_warps, num_stages=num_stages
             ),
             debug=False,
             interpret=interpret,
diff --git a/folx/experimental/pallas/attention/forward_laplacian.py b/folx/experimental/pallas/attention/forward_laplacian.py
@@ -5,6 +5,7 @@
 import jax
 import jax.numpy as jnp
 from jax.experimental import pallas as pl
+from jax.experimental.pallas import gpu as plgpu
 
 from folx import forward_laplacian
 from folx.api import FwdJacobian, FwdLaplArray
@@ -153,8 +154,8 @@ def mhsa_forward_laplacian(
                     dtype=q.dtype,  # o.laplacian
                 ),
             ],
-            compiler_params=dict(
-                triton=dict(num_warps=num_warps, num_stages=num_stages)
+            compiler_params=plgpu.TritonCompilerParams(
+                num_warps=num_warps, num_stages=num_stages
             ),
             debug=False,
             interpret=interpret,
@@ -588,8 +589,8 @@ def mhsea_forward_laplacian(
                     dtype=v.dtype,  # o.laplacian
                 ),
             ],
-            compiler_params=dict(
-                triton=dict(num_warps=num_warps, num_stages=num_stages)
+            compiler_params=plgpu.TritonCompilerParams(
+                num_warps=num_warps, num_stages=num_stages
             ),
             debug=False,
             interpret=interpret,
diff --git a/folx/experimental/pallas/attention/mhsa.py b/folx/experimental/pallas/attention/mhsa.py
@@ -5,6 +5,7 @@
 import jax
 import jax.numpy as jnp
 from jax.experimental import pallas as pl
+from jax.experimental.pallas import gpu as plgpu
 
 from .utils import (
     big_number,
@@ -58,8 +59,8 @@ def mhsa(
             out_shape=jax.ShapeDtypeStruct(
                 shape=(batch_len, seq_len, num_heads, head_len), dtype=q.dtype
             ),
-            compiler_params=dict(
-                triton=dict(num_warps=num_warps, num_stages=num_stages)
+            compiler_params=plgpu.TritonCompilerParams(
+                num_warps=num_warps, num_stages=num_stages
             ),
             debug=False,
             interpret=interpret,
diff --git a/folx/experimental/pallas/attention/mhsea.py b/folx/experimental/pallas/attention/mhsea.py
@@ -5,6 +5,7 @@
 import jax
 import jax.numpy as jnp
 from jax.experimental import pallas as pl
+from jax.experimental.pallas import gpu as plgpu
 
 from .utils import (
     big_number,
@@ -58,8 +59,8 @@ def mhsea(
                     shape=(batch_len, seq_len, num_heads), dtype=q.dtype
                 ),  # lse
             ],
-            compiler_params=dict(
-                triton=dict(num_warps=num_warps, num_stages=num_stages)
+            compiler_params=plgpu.TritonCompilerParams(
+                num_warps=num_warps, num_stages=num_stages
             ),
             debug=False,
             interpret=interpret,