-
Notifications
You must be signed in to change notification settings - Fork 58
feat(op): implement tensor.scatter_ element-level scatter #898
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -940,3 +940,74 @@ def scatter_update( | |
| op_args: list[Expr] = [input, index, src] | ||
| kwargs: dict[str, Any] = {"dim": dim_val} | ||
| return _ir_core.create_op_call("tensor.scatter_update", op_args, kwargs, actual_span) | ||
|
|
||
|
|
||
| def scatter_( | ||
| input: Expr, | ||
| *args: Expr | int | float, | ||
| dim: int | Expr | None = None, | ||
| index: Expr | None = None, | ||
| src: Expr | float | int | None = None, | ||
| reduce: str | None = None, | ||
| span: Span | None = None, | ||
| ) -> Call: | ||
| """Element-level scatter into tensor along a dimension. | ||
|
|
||
| For each position (i₀,…,iₙ) in index, sets: | ||
| input[i₀]…[i_{d-1}][ index[i₀…iₙ] ][i_{d+1}]…[iₙ] = src[i₀…iₙ] | ||
|
|
||
| Follows PyTorch ``torch.Tensor.scatter_`` semantics. | ||
|
|
||
| Accepts call forms: | ||
| - scatter_(input, dim, index, src) | ||
| - scatter_(input, dim, index, src=1.0) | ||
|
|
||
| Args: | ||
| input: Destination tensor (N-D). | ||
| dim: Dimension along which to scatter. | ||
| index: Index tensor (same rank as input, integer dtype). | ||
| src: Source tensor (same shape as index) or scalar value. | ||
| span: Optional source span for debugging (auto-captured if not provided). | ||
|
|
||
| Returns: | ||
| Call expression returning the updated input tensor. | ||
| """ | ||
| if len(args) == 3 and dim is None and index is None and src is None: | ||
| dim, index, src = args | ||
| elif len(args) == 2 and dim is not None and index is None and src is None: | ||
| index, src = args | ||
| elif len(args) == 1 and dim is None and index is not None and src is not None: | ||
| dim = args[0] | ||
| elif len(args) != 0: | ||
| raise TypeError( | ||
| "scatter_ expects (input, dim, index, src), " | ||
| "(input, index, src, dim=...), or (input, dim, index=..., src=...)" | ||
| ) | ||
|
|
||
| if dim is None or index is None or src is None: | ||
| raise TypeError("scatter_ requires input, dim, index, and src") | ||
|
|
||
| actual_span = _get_span_or_capture(span) | ||
| if isinstance(dim, ConstInt): | ||
| dim_val = int(dim.value) | ||
| elif isinstance(dim, int): | ||
| dim_val = dim | ||
| else: | ||
| raise TypeError(f"dim must be int or ConstInt, got {type(dim)}") | ||
|
|
||
| if not isinstance(index, Expr): | ||
| raise TypeError(f"index must be Expr, got {type(index)}") | ||
|
|
||
| # src can be Expr or scalar (int → ConstInt, float → ConstFloat) | ||
| if isinstance(src, int): | ||
| src = ConstInt(src, DataType.INT32, actual_span) | ||
| elif isinstance(src, float): | ||
| src = ConstFloat(src, DataType.FP32, actual_span) | ||
| elif not isinstance(src, Expr): | ||
| raise TypeError(f"src must be Expr or scalar, got {type(src)}") | ||
|
|
||
| op_args: list[Expr] = [input, index, src] | ||
| kwargs: dict[str, Any] = {"dim": dim_val} | ||
| if reduce is not None: | ||
| kwargs["reduce"] = reduce | ||
| return _ir_core.create_op_call("tensor.scatter_", op_args, kwargs, actual_span) | ||
|
Comment on lines
+945
to
+1013
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The def scatter_(
input: Expr,
*args: Expr | int | float,
dim: int | Expr | None = None,
index: Expr | None = None,
src: Expr | float | int | None = None,
reduce: str | None = None,
span: Span | None = None,
) -> Call:
"""Element-level scatter into tensor along a dimension.
For each position (i₀,…,iₙ) in index, sets:
input[i₀]…[i_{d-1}][ index[i₀…iₙ] ][i_{d+1}]…[iₙ] = src[i₀…iₙ]
Follows PyTorch ``torch.Tensor.scatter_`` semantics.
Accepts call forms:
- scatter_(input, dim, index, src)
- scatter_(input, dim, index, src=1.0)
Args:
input: Destination tensor (N-D).
dim: Dimension along which to scatter.
index: Index tensor (same rank as input, integer dtype).
src: Source tensor (same shape as index) or scalar value.
reduce: Optional reduction mode ("add" or "multiply").
span: Optional source span for debugging (auto-captured if not provided).
Returns:
Call expression returning the updated input tensor.
"""
if len(args) == 3 and dim is None and index is None and src is None:
dim, index, src = args
elif len(args) == 2 and dim is not None and index is None and src is None:
index, src = args
elif len(args) == 1 and dim is None and index is not None and src is not None:
dim = args[0]
elif len(args) != 0:
raise TypeError(
"scatter_ expects (input, dim, index, src), "
"(input, index, src, dim=...), or (input, dim, index=..., src=...)"
)
if dim is None or index is None or src is None:
raise TypeError("scatter_ requires input, dim, index, and src")
actual_span = _get_span_or_capture(span)
if isinstance(dim, ConstInt):
dim_val = int(dim.value)
elif isinstance(dim, int):
dim_val = dim
else:
raise TypeError(f"dim must be int or ConstInt, got {type(dim)}")
if not isinstance(index, Expr):
raise TypeError(f"index must be Expr, got {type(index)}")
# src can be Expr or scalar (int/float → ConstFloat)
if isinstance(src, (int, float)):
src = ConstFloat(float(src), DataType.FP32, actual_span)
elif not isinstance(src, Expr):
raise TypeError(f"src must be Expr or scalar, got {type(src)}")
op_args: list[Expr] = [input, index, src]
kwargs: dict[str, Any] = {"dim": dim_val}
if reduce is not None:
kwargs["reduce"] = reduce
return _ir_core.create_op_call("tensor.scatter_", op_args, kwargs, actual_span)References
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed: pl.scatter_() now passes reduce kwarg through both the IR and language layers. |
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -59,6 +59,7 @@ | |||||
| "reshape", | ||||||
| "transpose", | ||||||
| "scatter_update", | ||||||
| "scatter_", | ||||||
| ] | ||||||
|
|
||||||
| from pypto.ir.op import tensor_ops as _ir_ops | ||||||
|
|
@@ -779,3 +780,38 @@ def scatter_update( | |||||
| """ | ||||||
| call_expr = _ir_ops.scatter_update(input.unwrap(), dim, index.unwrap(), src.unwrap()) | ||||||
| return Tensor(expr=call_expr) | ||||||
|
|
||||||
|
|
||||||
| def scatter_( | ||||||
| input: Tensor, | ||||||
| dim: int, | ||||||
| index: Tensor, | ||||||
| src: float | int | Tensor, | ||||||
| *, | ||||||
| reduce: str | None = None, | ||||||
| ) -> Tensor: | ||||||
| """Element-level scatter: write src values into input at positions given by index along dim. | ||||||
|
|
||||||
| For each element position (i0,...,in) in index, sets: | ||||||
| input[i0]...[i_{d-1}][index[i0...in]][i_{d+1}]...[in] = src[i0...in] | ||||||
|
|
||||||
| Supports arbitrary rank and any valid dim in [-rank, rank). | ||||||
| src can be a tensor (same shape as index) or a scalar value. | ||||||
|
|
||||||
| Args: | ||||||
| input: Destination tensor (N-D) | ||||||
| dim: Dimension along which to scatter | ||||||
| index: Index tensor (N-D, same rank as input) of integer dtype | ||||||
| src: Source tensor (same shape as index) or scalar value | ||||||
| reduce: Optional reduce mode ("add" or "multiply") | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: #!/bin/bash
# Check if reduce validation exists anywhere in the scatter implementation chain
rg -n "reduce" --type cpp src/ir/op/tensor_ops/scatter.cpp | head -20
rg -n "reduce.*add\|reduce.*multiply" --type cpp src/ir/transforms/op_conversion_registry.cpp | head -10Repository: hw-native-sys/pypto Length of output: 103 🏁 Script executed: # Check the context around line 806 in tensor_ops.py
cd python/pypto/language/op && sed -n '795,820p' tensor_ops.pyRepository: hw-native-sys/pypto Length of output: 957 🏁 Script executed: # Look at the full scatter function implementation
cd python/pypto/language/op && grep -n "def scatter" tensor_ops.pyRepository: hw-native-sys/pypto Length of output: 104 🏁 Script executed: # Check the C++ scatter implementation for reduce handling
cat src/ir/op/tensor_ops/scatter.cpp | head -150Repository: hw-native-sys/pypto Length of output: 5645 🏁 Script executed: # Search for where the reduce parameter is used in C++ code
rg -B5 -A10 "reduce" --type cpp src/ir/op/tensor_ops/scatter.cppRepository: hw-native-sys/pypto Length of output: 661 🏁 Script executed: # Search for reduce parameter usage in tests or examples
rg -n "reduce" --type python python/pypto | grep -E "(test|example)" | head -20Repository: hw-native-sys/pypto Length of output: 93 🏁 Script executed: # Search for scatter tests
fd -e py "test.*scatter" python/Repository: hw-native-sys/pypto Length of output: 45 🏁 Script executed: # Look for where scatter_ is actually called with reduce parameter
rg -B2 -A2 'scatter_.*reduce' --type python python/Repository: hw-native-sys/pypto Length of output: 93 🏁 Script executed: # Search for scatter tests and reduce usage
fd -type f -name "*scatter*" python/Repository: hw-native-sys/pypto Length of output: 233 🏁 Script executed: # Search for reduce parameter usage in Python files
rg "reduce\s*=" --type py python/ | head -20Repository: hw-native-sys/pypto Length of output: 396 🏁 Script executed: # Look at the actual backend to understand what reduce modes are supported
rg -n "reduce" src/ | grep -E "(add|multiply)" | head -20Repository: hw-native-sys/pypto Length of output: 541 🏁 Script executed: # Check if there are any comments or documentation about valid reduce modes
rg -B3 -A3 "reduce.*add.*multiply" --type py --type cpp --type txtRepository: hw-native-sys/pypto Length of output: 1343 Update docstring to include all valid The Python docstring lists only - reduce: Optional reduce mode ("add" or "multiply")
+ reduce: Optional reduce mode ("none", "add", or "multiply")📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||
|
|
||||||
| Returns: | ||||||
| Tensor wrapping the scatter_ operation | ||||||
| """ | ||||||
| src_expr: float | int | Expr | ||||||
| if isinstance(src, (Tensor, Scalar)): | ||||||
| src_expr = src.unwrap() | ||||||
| else: | ||||||
| src_expr = src | ||||||
| call_expr = _ir_ops.scatter_(input.unwrap(), dim, index.unwrap(), src_expr, reduce=reduce) | ||||||
| return Tensor(expr=call_expr) | ||||||
|
coderabbitai[bot] marked this conversation as resolved.
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -717,22 +717,46 @@ static std::string MakeTileAllocCodegenPTO(const CallPtr& op, codegen::CodegenBa | |
| return ""; // No MLIR emission - pto.alloc_tile generated from MemRefs in TileTypes | ||
| } | ||
|
|
||
| // Compute a row-major flat offset string from a MakeTuple of indices and the shape of the container. | ||
| // Compute a row-major flat offset from a MakeTuple of indices and the shape, | ||
| // emitting proper arith.muli / arith.addi SSA operations. | ||
| // Returns the SSA name of the final flat-offset value (index type). | ||
| static std::string ComputeFlatOffsetPTO(const ir::MakeTuplePtr& indices_tuple, | ||
| const std::vector<ir::ExprPtr>& shape, codegen::PTOCodegen& codegen) { | ||
| const auto& indices = indices_tuple->elements_; | ||
| INTERNAL_CHECK(indices.size() == shape.size()) | ||
| << "Index count (" << indices.size() << ") must match shape rank (" << shape.size() << ")"; | ||
|
|
||
| std::ostringstream idx_oss; | ||
| // Helper: ensure an index element SSA value has `index` type. | ||
| // If the expression is a non-index integer (e.g. i32 from tile.read on an | ||
| // INT32 tile), emit arith.index_cast to convert it. | ||
| auto ensure_index = [&](const ir::ExprPtr& expr, const std::string& ssa) -> std::string { | ||
| if (auto var = ir::As<ir::Var>(expr)) { | ||
| return codegen.EmitCastToIndex(var, ssa); | ||
| } | ||
| return ssa; | ||
| }; | ||
|
|
||
| // For each dimension i, compute: index[i] * (shape[i+1] * shape[i+2] * ... * shape[rank-1]) | ||
| // then sum all terms with arith.addi. | ||
| std::string accumulator; | ||
| for (size_t i = 0; i < indices.size(); ++i) { | ||
| if (i > 0) idx_oss << " + "; | ||
| idx_oss << codegen.GetExprAsCode(indices[i]); | ||
| std::string term = ensure_index(indices[i], codegen.GetExprAsCode(indices[i])); | ||
| // Multiply by each trailing dimension size | ||
| for (size_t j = i + 1; j < shape.size(); ++j) { | ||
| idx_oss << " * " << codegen.GetExprAsCode(shape[j]); | ||
| std::string dim = codegen.GetExprAsCode(shape[j]); | ||
| std::string tmp = codegen.NewTemp(); | ||
| codegen.Emit(tmp + " = arith.muli " + term + ", " + dim + " : index"); | ||
|
Comment on lines
+729
to
+748
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Cast all non- Line 733 only normalizes plain 🔧 Suggested fix- auto ensure_index = [&](const ir::ExprPtr& expr, const std::string& ssa) -> std::string {
- if (auto var = ir::As<ir::Var>(expr)) {
- return codegen.EmitCastToIndex(var, ssa);
- }
- return ssa;
- };
+ auto ensure_index = [&](const ir::ExprPtr& expr, const std::string& ssa) -> std::string {
+ auto scalar_type = As<ScalarType>(expr->GetType());
+ INTERNAL_CHECK(scalar_type) << "flat index expression must be scalar";
+ if (scalar_type->dtype_ == DataType::INDEX) {
+ return ssa;
+ }
+ CHECK(!scalar_type->dtype_.IsFloat()) << "flat index expression must be integer/index typed";
+ std::string idx_ssa = codegen.NewTemp();
+ codegen.Emit(idx_ssa + " = arith.index_cast " + ssa + " : " +
+ codegen.GetTypeString(scalar_type->dtype_) + " to index");
+ return idx_ssa;
+ };🤖 Prompt for AI Agents |
||
| term = tmp; | ||
| } | ||
| if (accumulator.empty()) { | ||
| accumulator = term; | ||
| } else { | ||
| std::string tmp = codegen.NewTemp(); | ||
| codegen.Emit(tmp + " = arith.addi " + accumulator + ", " + term + " : index"); | ||
| accumulator = tmp; | ||
| } | ||
| } | ||
| return idx_oss.str(); | ||
| return accumulator; | ||
| } | ||
|
|
||
| // Get or emit a flat offset SSA value for a MakeTuple of indices and shape. | ||
|
|
@@ -932,6 +956,15 @@ static std::string MakeTensorDimCodegenPTO(const CallPtr& op, codegen::CodegenBa | |
| return ""; | ||
| } | ||
|
|
||
| static std::string MakeSystemBarrierCodegenPTO(const std::string& pipe_name, const CallPtr& op, | ||
| codegen::CodegenBase& codegen_base) { | ||
| CHECK(op->args_.empty()) << "system.barrier_" << pipe_name << " expects 0 arguments, got " | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 为什么要插入barrier,这个工作应该是ptoas做的 |
||
| << op->args_.size(); | ||
| auto& codegen = dynamic_cast<codegen::PTOCodegen&>(codegen_base); | ||
| codegen.Emit("pto.barrier #pto.pipe<" + pipe_name + ">"); | ||
| return ""; | ||
| } | ||
|
|
||
| // ============================================================================ | ||
| // Cross-Core Communication Operations (TPUSH/TPOP) | ||
| // ============================================================================ | ||
|
|
@@ -1334,6 +1367,15 @@ void RegisterPTOOps(Backend& backend, const std::unordered_set<std::string>& exc | |
| reg("tile.cast", [](const ir::CallPtr& op, codegen::CodegenBase& codegen) { | ||
| return MakeTileCvtCodegenPTO("pto.tcvt", op, codegen); | ||
| }); | ||
| reg("system.bar_v", [](const ir::CallPtr& op, codegen::CodegenBase& codegen) { | ||
| return MakeSystemBarrierCodegenPTO("PIPE_V", op, codegen); | ||
| }); | ||
| reg("system.bar_m", [](const ir::CallPtr& op, codegen::CodegenBase& codegen) { | ||
| return MakeSystemBarrierCodegenPTO("PIPE_M", op, codegen); | ||
| }); | ||
| reg("system.bar_all", [](const ir::CallPtr& op, codegen::CodegenBase& codegen) { | ||
| return MakeSystemBarrierCodegenPTO("PIPE_ALL", op, codegen); | ||
| }); | ||
| // tile.full (TEXPANDS): output is row_major per ISA | ||
| if (exclude_ops.count("tile.full") == 0) { | ||
| backend.RegisterOp("tile.full") | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.