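save work (WIP): update_address_space handles a single target op; drop debug prints from softmax schedule; set gpu-func allocas to address space 3; temporarily disable convert-vector-to-xegpu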

charithaintc · charithaintc · commit eacc9d876840 · 2026-04-17T00:10:42.000Z
diff --git a/lighthouse/dialects/transform/transform_ext/ops/update_address_space.py b/lighthouse/dialects/transform/transform_ext/ops/update_address_space.py
@@ -32,51 +32,46 @@ def apply(
             state: transform.TransformState,
         ) -> DiagnosedSilenceableFailure:
             # Get the target operations to transform
-            target_ops = state.get_payload_ops(op.target)
+            target_op = state.get_payload_ops(op.target)[0]
             # Get the address space value from the attribute
             address_space_value = ir.IntegerAttr(op.address_space).value
             new_ops = []
 
-            for target_op in target_ops:
-                # Verify this is a memref.alloca operation
-                if target_op.OPERATION_NAME != "memref.alloca":
-                    return DiagnosedSilenceableFailure.emit_silenceable_error(
-                        f"Expected memref.alloca operation, got {target_op.OPERATION_NAME}"
-                    )
-
-                # Get the current result type (should be a MemRefType)
-                old_result_type = target_op.results[0].type
-                memref_type = ir.MemRefType(old_result_type)
-                # Create a new memref type with the specified address space
-                new_memref_type = ir.MemRefType.get(
-                    memref_type.shape,
-                    memref_type.element_type,
-                    layout=memref_type.layout,
-                    memory_space=ir.Attribute.parse(f"{address_space_value}"),
+            # Verify this is a memref.alloca operation
+            if target_op.OPERATION_NAME != "memref.alloca":
+                return DiagnosedSilenceableFailure.emit_silenceable_error(
+                    f"Expected memref.alloca operation, got {target_op.OPERATION_NAME}"
                 )
 
-                # Replace the operation with a new one that has the updated type
-                with ir.InsertionPoint(target_op):
-                    # Get the operands from the original alloca (dynamic sizes and symbols)
-                    dynamic_sizes = list(
-                        target_op.operands[
-                            : target_op.attributes["operandSegmentSizes"][0]
-                        ]
-                    )
-                    symbol_operands = list(
-                        target_op.operands[
-                            target_op.attributes["operandSegmentSizes"][0] :
-                        ]
-                    )
-                    # Create a new alloca with the updated type
-                    new_alloca = memref.alloca(
-                        new_memref_type, dynamic_sizes, symbol_operands
-                    )
-                    # Replace all uses of the old operation with the new one
-                    # FIXME: This won't handle operations that consume the memref type and
-                    # return a new memref (such as subview).
-                    rewriter.replace_op(target_op, [new_alloca])
-                    new_ops.append(new_alloca.owner)
+            # Get the current result type (should be a MemRefType)
+            old_result_type = target_op.results[0].type
+            memref_type = ir.MemRefType(old_result_type)
+            # Create a new memref type with the specified address space
+            new_memref_type = ir.MemRefType.get(
+                memref_type.shape,
+                memref_type.element_type,
+                layout=memref_type.layout,
+                memory_space=ir.Attribute.parse(f"{address_space_value}"),
+            )
+
+            # Replace the operation with a new one that has the updated type
+            with ir.InsertionPoint(target_op):
+                # Get the operands from the original alloca (dynamic sizes and symbols)
+                dynamic_sizes = list(
+                    target_op.operands[: target_op.attributes["operandSegmentSizes"][0]]
+                )
+                symbol_operands = list(
+                    target_op.operands[target_op.attributes["operandSegmentSizes"][0] :]
+                )
+                # Create a new alloca with the updated type
+                new_alloca = memref.alloca(
+                    new_memref_type, dynamic_sizes, symbol_operands
+                )
+                # Replace all uses of the old operation with the new one
+                # FIXME: This won't handle operations that consume the memref type and
+                # return a new memref (such as subview).
+                rewriter.replace_op(target_op, [new_alloca])
+                new_ops.append(new_alloca.owner)
 
             # Set the results to the new operations
             results.set_ops(op.updated_op, new_ops)
diff --git a/lighthouse/schedule/xegpu/softmax_schedule.py b/lighthouse/schedule/xegpu/softmax_schedule.py
@@ -16,6 +16,7 @@
     PipelineInterrupt,
 )
 from lighthouse.schedule.xegpu.helper import bundle_xegpu_to_binary
+from lighthouse.dialects.transform import transform_ext
 
 
 def get_softmax_schedule_module(
@@ -140,7 +141,6 @@ def bundle_xegpu_softmax_schedule(
         transform.AnyOpType.get(), func, ops=["linalg.softmax"]
     )
     structured.structured_decompose_interface(anytype, softmax_ops)
-    transform.print_(target=func, name="Aftemr structured_decompose_interface")
 
     linalg_ops = match_and_split(
         func, ops={"linalg.generic", "linalg.fill"}, nhandles=6
@@ -157,18 +157,13 @@ def bundle_xegpu_softmax_schedule(
         div_op, sizes=[0, reduction_step_size]
     ).results
 
-    transform.print_(target=func, name="After tiling div op")
-
     # Fuse max_center_and_exp_op into the div loop
     _, fused_loop = structured.structured_fuse_into_containing_op(
         anytype,
         anytype,
         producer_op=max_center_and_exp_op,
         containing_op=div_loop,
     )
-    transform.print_(
-        target=func, name="After fusing max_center_and_exp_op into div loop"
-    )
 
     # Tile the sum reduction and fuse the sub+exp producer into it
     _, _, _, sum_loop = structured.structured_tile_reduction_using_for(
@@ -180,8 +175,6 @@ def bundle_xegpu_softmax_schedule(
         tile_sizes=[0, reduction_step_size],
     )
 
-    transform.print_(target=func, name="After tiling sum reduction")
-
     func = transform.get_parent_op(
         anytype,
         fused_loop,
@@ -200,9 +193,6 @@ def bundle_xegpu_softmax_schedule(
         producer_op=max_center_and_exp_op,
         containing_op=sum_loop,
     )
-    transform.print_(
-        target=func, name="After fusing max_center_and_exp_op into sum loop"
-    )
 
     # Tile the max reduction.
     max_reduction = linalg_ops[0]
@@ -214,7 +204,6 @@ def bundle_xegpu_softmax_schedule(
         target=max_reduction,
         tile_sizes=[0, reduction_step_size],
     )
-    transform.print_(target=func, name="After tiling max reduction")
 
     # Cleanup after tiling and fusion
     transform.apply_cse(func)
@@ -231,8 +220,6 @@ def bundle_xegpu_softmax_schedule(
     transform.apply_cse(func)
     canonicalize(func)
 
-    transform.print_(target=func, name="After vectorization")
-
     if stop_at_stage == "vectorized":
         raise PipelineInterrupt()
 
@@ -250,8 +237,6 @@ def bundle_xegpu_softmax_schedule(
     transform.apply_cse(mod)
     canonicalize(mod)
 
-    transform.print_(target=mod, name="After bufferization")
-
     # promote memref.alloc to memref.alloca in payload function
     func = match(mod, ops={"func.func"})
     func = apply_registered_pass(
@@ -263,8 +248,6 @@ def bundle_xegpu_softmax_schedule(
         },
     )
 
-    transform.print_(target=func, name="After promoting buffers to stack")
-
     if stop_at_stage == "bufferized":
         raise PipelineInterrupt()
 
@@ -294,8 +277,6 @@ def bundle_xegpu_softmax_schedule(
     mod = apply_registered_pass(mod, "gpu-kernel-outlining")
     transform.apply_cse(mod)
 
-    transform.print_(target=mod, name="After GPU outlining")
-
     if stop_at_stage == "gpu-outlining":
         raise PipelineInterrupt()
 
@@ -306,12 +287,16 @@ def bundle_xegpu_softmax_schedule(
         options={"O": "3", "chip": "bmg"},
     )
 
-    # convert vector to xegpu
+    # for each gpu function in the gpu module, change memref.alloca address
+    # space to 3 (SLM); the vector-to-xegpu conversion below is disabled for now.
     gpu_mod_ops = match_and_split(mod, ops={"gpu.module"})
     for gpu_mod in gpu_mod_ops:
         gpu_func = match(gpu_mod, ops={"gpu.func"})
-        gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu")
-        transform.apply_cse(gpu_func)
+        allocas = match_and_split(gpu_func, ops={"memref.alloca"})
+        for alloca in allocas:
+            transform_ext.update_address_space(alloca, address_space=3)
+        # gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu")
+        # transform.apply_cse(gpu_func)
 
     # Cleanup.
     transform.apply_cse(mod)