Skip to content

Commit 9a8577a

Browse files
author
Youhezhen
committed
feat(ir): add tile.mscatter op for per-element scatter-store to GM (#921)
Add tile.mscatter operation mapping to the pto.mscatter instruction: mem[idx[i, j]] = src[i, j].

- C++ op registration with type deduction and validation
- PTO codegen emitting partition_view + pto.mscatter
- Python IR and DSL wrappers with pl.mscatter export
- Unit tests covering basic usage and error paths
- ST runtime tests (skipped: PTOAS lacks an NPU mscatter implementation)
1 parent 1589d7e commit 9a8577a

8 files changed

Lines changed: 987 additions & 1 deletion

File tree

python/pypto/ir/op/tile_ops.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,31 @@ def scatter_update(
325325
return _ir_core.create_op_call("tile.scatter_update", op_args, kwargs, actual_span)
326326

327327

328+
def mscatter(
    src: Expr,
    idx: Expr,
    output_tensor: Expr,
    span: Span | None = None,
) -> Call:
    """Scatter-store elements of ``src`` into ``output_tensor`` at per-element indices.

    Semantics: ``output_tensor[idx[i, j]] = src[i, j]``; lowered to the
    PTOAS ``pto.mscatter`` instruction.

    Args:
        src: Source tile (FP16, FP32, INT16, or INT32)
        idx: Index tile (INT32, same rank as src)
        output_tensor: Output tensor (TensorType, same dtype as src)
        span: Optional source span for debugging (auto-captured if not provided)

    Returns:
        Call expression that returns the output tensor
    """
    op_args = [src, idx, output_tensor]
    actual_span = _get_span_or_capture(span)
    return _ir_core.create_op_call("tile.mscatter", op_args, {}, actual_span)
351+
352+
328353
def concat(
329354
src0: Expr,
330355
src1: Expr,

python/pypto/language/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,9 @@ def scalar_func(x: pl.Scalar[pl.FP32]) -> pl.Scalar[pl.FP32]:
133133
xor,
134134
xors,
135135
)
136+
from .op.tile_ops import (
137+
mscatter as mscatter,
138+
)
136139
from .op.unified_ops import (
137140
add,
138141
cast,

python/pypto/language/op/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
minimum,
5353
mins,
5454
move,
55+
mscatter,
5556
not_,
5657
or_,
5758
ors,
@@ -177,6 +178,7 @@
177178
"shrs",
178179
"maxs",
179180
"mins",
181+
"mscatter",
180182
"prelu",
181183
"not_",
182184
"addc",

python/pypto/language/op/tile_ops.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@
112112
"tpop_from_aiv",
113113
"sort32",
114114
"gather",
115+
"mscatter",
115116
"MaskPattern",
116117
"mrgsort",
117118
]
@@ -1687,6 +1688,28 @@ def gather(
16871688
return Tile(expr=call_expr)
16881689

16891690

1691+
def mscatter(src: Tile, idx: Tile, output_tensor: Tensor) -> Tensor:
    """Scatter-store tile elements into a tensor at per-element indices.

    Semantics: ``output_tensor[idx[i, j]] = src[i, j]``; maps to the
    PTOAS ``pto.mscatter`` instruction.

    Args:
        src: Source tile (FP16, FP32, INT16, or INT32)
        idx: Index tile (INT32, same rank as src)
        output_tensor: Output tensor to scatter into (same dtype as src)

    Returns:
        Tensor wrapping the mscatter operation

    Example:
        >>> result = pl.tile.mscatter(src_tile, idx_tile, out_tensor)
    """
    unwrapped_args = (src.unwrap(), idx.unwrap(), output_tensor.unwrap())
    return Tensor(expr=_ir_ops.mscatter(*unwrapped_args))
1711+
1712+
16901713
@overload
16911714
def mrgsort(src0: Tile, *, block_len: int | Scalar) -> Tile: ...
16921715

src/backend/common/pto_ops_common.cpp

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -713,6 +713,84 @@ static std::string MakeTileStoreCodegenPTO(const CallPtr& op, codegen::CodegenBa
713713
return "";
714714
}
715715

716+
// tile.mscatter(src, idx, output_tensor) -> pto.mscatter
// Generates:
//   %pview = pto.partition_view %tensor_view, offsets=[0,...], sizes=[d0,...] : ... -> ...
//   pto.mscatter ins(%src, %idx : !pto.tile_buf<...>, !pto.tile_buf<...>)
//                outs(%pview : !pto.partition_tensor_view<...>)
// The partition covers the whole tensor (offsets all zero, sizes = shape) since
// mscatter addresses elements via the per-element index tile, not the partition.
static std::string MakeTileMscatterCodegenPTO(const CallPtr& op, codegen::CodegenBase& codegen_base) {
  auto& codegen = dynamic_cast<codegen::PTOCodegen&>(codegen_base);
  // Type deduction already enforced arity; re-check here as an internal invariant.
  INTERNAL_CHECK(op->args_.size() == 3)
      << "tile.mscatter requires 3 arguments (src, idx, output_tensor), got " << op->args_.size();

  // All three operands must be name-able SSA values (Var or IterArg) so they can
  // be referenced by name in the emitted MLIR.
  auto src = AsVarLike(op->args_[0]);
  INTERNAL_CHECK(src) << "tile.mscatter src must be a Var or IterArg";
  auto idx = AsVarLike(op->args_[1]);
  INTERNAL_CHECK(idx) << "tile.mscatter idx must be a Var or IterArg";
  auto output_tensor = AsVarLike(op->args_[2]);
  INTERNAL_CHECK(output_tensor) << "tile.mscatter output_tensor must be a Var or IterArg";

  auto tensor_type = As<TensorType>(output_tensor->GetType());
  INTERNAL_CHECK(tensor_type) << "tile.mscatter output_tensor must have TensorType";

  std::string src_name = codegen.GetVarName(src);
  std::string idx_name = codegen.GetVarName(idx);
  std::string src_type_annot = codegen.GetExprTypeAnnotation(op->args_[0]);
  std::string idx_type_annot = codegen.GetExprTypeAnnotation(op->args_[1]);

  std::string dtype_str = codegen.GetTypeString(tensor_type->dtype_);
  std::string tensor_view = codegen.GetOrCreateTensorView(output_tensor);
  std::string tensor_view_type = codegen.GetTensorViewTypeString(tensor_type.get());

  // Build pto.partition_view covering the entire tensor (mscatter uses per-element
  // indices, so the partition is the whole tensor — offsets all zero, sizes = shape).
  std::string partition_view = codegen.NewNamedTemp(output_tensor->name_hint_ + "_pview");
  std::ostringstream partition_line;
  partition_line << partition_view << " = pto.partition_view " << tensor_view;
  partition_line << ", offsets = [";
  for (size_t i = 0; i < tensor_type->shape_.size(); ++i) {
    if (i > 0) partition_line << ", ";
    partition_line << codegen.GetIndexConstant(0);
  }
  partition_line << "], sizes = [";
  // The sizes list and the "<d0xd1x...xdtype>" result type are built in lockstep:
  // static dims print their constant value, dynamic dims print an expression in
  // the sizes list and "?" in the type (the MLIR dynamic-dimension marker).
  std::string partition_type = "!pto.partition_tensor_view<";
  for (size_t i = 0; i < tensor_type->shape_.size(); ++i) {
    if (i > 0) {
      partition_line << ", ";
      partition_type += "x";
    }
    if (auto c = As<ir::ConstInt>(tensor_type->shape_[i])) {
      partition_line << codegen.GetIndexConstant(c->value_);
      partition_type += std::to_string(c->value_);
    } else {
      partition_line << codegen.GetExprAsCode(tensor_type->shape_[i]);
      partition_type += "?";
    }
  }
  partition_line << "]";
  partition_type += "x" + dtype_str + ">";
  partition_line << " : " << tensor_view_type << " -> " << partition_type;
  codegen.Emit(partition_line.str());

  // Emit pto.mscatter with partition_view in outs().
  // NOTE(review): the type annotation is emitted only when BOTH annotations are
  // non-empty — presumably PTOAS rejects a partial annotation list; confirm.
  std::ostringstream mscatter_line;
  mscatter_line << "pto.mscatter ins(" << src_name << ", " << idx_name;
  if (!src_type_annot.empty() && !idx_type_annot.empty()) {
    mscatter_line << " : " << src_type_annot << ", " << idx_type_annot;
  }
  mscatter_line << ") outs(" << partition_view << " : " << partition_type << ")";
  codegen.Emit(mscatter_line.str());

  // Propagate tensor_view to the result var so downstream ops see the updated tensor
  // (the op's result aliases the scattered-into tensor; see set_output_reuses_input(2)
  // in the op registration).
  auto result_var = codegen.GetCurrentResultVar();
  if (result_var != nullptr) {
    codegen.RegisterTensorView(result_var, tensor_view);
    codegen.RegisterVarToMlir(result_var, tensor_view);
  }

  // Emission goes through codegen.Emit(); nothing to return inline.
  return "";
}
793+
716794
// Helper function for tile.alloc (no-op: allocation handled elsewhere)
717795
static std::string MakeTileAllocCodegenPTO(const CallPtr& op, codegen::CodegenBase& codegen_base) {
718796
(void)op;
@@ -1171,7 +1249,6 @@ struct SimpleOpEntry {
11711249
static const SimpleOpEntry kSimpleOps[] = {
11721250
// Memory operations
11731251
{"tile.mgather", "pto.tmgather", 2},
1174-
{"tile.mscatter", "pto.tmscatter", 2},
11751252
// Tile x Tile arithmetic operations
11761253
{"tile.add", "pto.tadd", 2},
11771254
{"tile.sub", "pto.tsub", 2},
@@ -1321,6 +1398,15 @@ void RegisterPTOOps(Backend& backend, const std::unordered_set<std::string>& exc
13211398
reg("tile.store", [](const ir::CallPtr& op, codegen::CodegenBase& codegen) {
13221399
return MakeTileStoreCodegenPTO(op, codegen);
13231400
});
1401+
// tile.mscatter: src and idx must be row_major (MTE3 DMA reads UB linearly)
1402+
if (exclude_ops.count("tile.mscatter") == 0) {
1403+
backend.RegisterOp("tile.mscatter")
1404+
.f_codegen([](const ir::CallPtr& op, codegen::CodegenBase& codegen) {
1405+
return MakeTileMscatterCodegenPTO(op, codegen);
1406+
})
1407+
.set_input_layout(0, ir::TileLayout::row_major)
1408+
.set_input_layout(1, ir::TileLayout::row_major);
1409+
}
13241410
reg("tile.alloc", [](const ir::CallPtr& op, codegen::CodegenBase& codegen) {
13251411
return MakeTileAllocCodegenPTO(op, codegen);
13261412
});

src/ir/op/tile_ops/memory.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,65 @@ REGISTER_OP("tile.store")
535535
return DeduceTileStoreType(args, kwargs, "tile.store");
536536
});
537537

538+
// ============================================================================
539+
// tile.mscatter: scatter-store tile elements to tensor via per-element indices
540+
// Maps to pto.mscatter: mem[idx[i, j]] = src[i, j]
541+
// ============================================================================
542+
543+
TypePtr DeduceTileMscatterType(const std::vector<ExprPtr>& args,
544+
const std::vector<std::pair<std::string, std::any>>& kwargs,
545+
const std::string& op_name) {
546+
CHECK(args.size() == 3) << "The operator " << op_name
547+
<< " requires 3 arguments (src, idx, output_tensor), but got " << args.size();
548+
549+
// First arg: src tile (FP16/FP32/INT16/INT32)
550+
auto src_type = As<TileType>(args[0]->GetType());
551+
CHECK(src_type) << "The operator " << op_name << " requires first argument to be a TileType, but got "
552+
<< args[0]->GetType()->TypeName();
553+
CHECK(src_type->dtype_ == DataType::FP16 || src_type->dtype_ == DataType::FP32 ||
554+
src_type->dtype_ == DataType::INT16 || src_type->dtype_ == DataType::INT32)
555+
<< "The operator " << op_name << " requires src dtype to be FP16, FP32, INT16, or INT32, but got "
556+
<< src_type->dtype_.ToString();
557+
558+
// Second arg: idx tile (INT32, same rank as src)
559+
auto idx_type = As<TileType>(args[1]->GetType());
560+
CHECK(idx_type) << "The operator " << op_name << " requires second argument to be a TileType, but got "
561+
<< args[1]->GetType()->TypeName();
562+
CHECK(idx_type->dtype_ == DataType::INT32)
563+
<< "The operator " << op_name << " requires idx dtype to be INT32, but got "
564+
<< idx_type->dtype_.ToString();
565+
CHECK(idx_type->shape_.size() == src_type->shape_.size())
566+
<< "The operator " << op_name << " requires idx rank to match src rank (" << src_type->shape_.size()
567+
<< "), but got " << idx_type->shape_.size();
568+
569+
// Third arg: output tensor (same dtype as src)
570+
auto tensor_type = As<TensorType>(args[2]->GetType());
571+
CHECK(tensor_type) << "The operator " << op_name << " requires third argument to be a TensorType, but got "
572+
<< args[2]->GetType()->TypeName();
573+
CHECK(tensor_type->dtype_ == src_type->dtype_)
574+
<< "The operator " << op_name << " requires output_tensor dtype (" << tensor_type->dtype_.ToString()
575+
<< ") to match src dtype (" << src_type->dtype_.ToString() << ")";
576+
577+
// mscatter returns the output tensor (same type)
578+
return tensor_type;
579+
}
580+
581+
// Registration for tile.mscatter; type checking is delegated to
// DeduceTileMscatterType, codegen is registered separately in the PTO backend.
REGISTER_OP("tile.mscatter")
    .set_op_category("TileOp")
    .set_description(
        "Scatter-store elements from src tile to tensor at per-element indices "
        "(maps to pto.mscatter)")
    .add_argument("src", "Source tile (FP16, FP32, INT16, or INT32)")
    .add_argument("idx", "Index tile (INT32, same rank as src)")
    .add_argument("output_tensor", "Output tensor (TensorType, same dtype as src)")
    // Both tile operands are placed in the Vec memory space.
    .set_input_memory(0, MemorySpace::Vec)
    .set_input_memory(1, MemorySpace::Vec)
    // The result aliases argument 2 — mscatter writes into the given tensor
    // rather than allocating a fresh output.
    .set_output_reuses_input(2)
    .f_deduce_type([](const std::vector<ExprPtr>& args,
                      const std::vector<std::pair<std::string, std::any>>& kwargs) {
      return DeduceTileMscatterType(args, kwargs, "tile.mscatter");
    });
});
596+
538597
REGISTER_OP("tile.move")
539598
.set_op_category("TileOp")
540599
.set_description("Move tile between memory levels (Vec/Mat/Left/Right)")

0 commit comments

Comments
 (0)