Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion docs/en/dev/ir/05-operators.md
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ UINT32 + INT32 → INT32 (signed precedence)
**Location**: `src/ir/op/tensor_ops/`
**Python API**: `from pypto.ir.op import tensor`

**Operations:** `tensor.add/sub/mul/div` (element-wise with full N-D broadcasting), `tensor.set_validshape` (internal, update valid-shape metadata without data movement — compiler-generated only), `tensor.sort32` / `tensor.mrgsort_format1` / `tensor.mrgsort_format2` (sorting; tensor-level counterparts of `tile.sort32` / `tile.mrgsort` — converted to tile ops by `ConvertTensorToTileOps`), `tensor.gather` (per-dim indexing; MVP supports rank-2 inputs with `dim=-1` and lowers to a per-row `tile.gather` loop via `ConvertTensorToTileOps`), `tensor.gather_mask` (mask-pattern gather; tensor-level counterpart of `tile.gather_mask`, with optional same-bit-width `output_dtype`)
**Operations:** `tensor.add/sub/mul/div` (element-wise with full N-D broadcasting), `tensor.set_validshape` (internal, update valid-shape metadata without data movement — compiler-generated only), `tensor.sort32` / `tensor.mrgsort_format1` / `tensor.mrgsort_format2` (sorting; tensor-level counterparts of `tile.sort32` / `tile.mrgsort` — converted to tile ops by `ConvertTensorToTileOps`), `tensor.gather` (per-dim indexing; MVP supports rank-2 inputs with `dim=-1` and lowers to a per-row `tile.gather` loop via `ConvertTensorToTileOps`), `tensor.gather_mask` (mask-pattern gather; tensor-level counterpart of `tile.gather_mask`, with optional same-bit-width `output_dtype`), `tensor.ci` / `tensor.arange` (contiguous integer sequence generation; dtype ∈ {INT16, INT32}; lowers to `tile.ci`)

**Example:**

Expand Down Expand Up @@ -258,6 +258,7 @@ with ib.function("tensor_example") as f:
| - | `tile.reshape` | Reshape tile to new dimensions (element count must match) |
| - | `tile.transpose` | Swap two axes of a tile |
| - | `tile.set_validshape` | Update valid-shape metadata without data movement |
| - | `tile.ci` | Generate contiguous integer sequence (start + k / start - k); dtype ∈ {INT16, INT32}; innermost dim != 1 |
| **Reduction** | `tile.sum` | Reduction along axis (axis, keepdim) |

**Data Flow:** `TensorType (DDR) → tile.load → TileType (Unified Buffer) → tile.{ops} → TileType → tile.store → TensorType (DDR)`
Expand Down
3 changes: 2 additions & 1 deletion docs/zh-cn/dev/ir/05-operators.md
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ UINT32 + INT32 → INT32 (signed precedence)
**位置**:`src/ir/op/tensor_ops/`
**Python API**:`from pypto.ir.op import tensor`

**操作:** `tensor.add/sub/mul/div`(逐元素,支持完整 N 维广播),`tensor.set_validshape`(内部 API,更新 valid_shape 元数据,不搬移数据 — 仅供编译器生成代码使用),`tensor.sort32` / `tensor.mrgsort_format1` / `tensor.mrgsort_format2`(排序;分别对应 `tile.sort32` / `tile.mrgsort` 的 tensor 层接口,由 `ConvertTensorToTileOps` 转换为 tile 操作),`tensor.gather`(按维索引;MVP 仅支持 2D 输入 + `dim=-1`,由 `ConvertTensorToTileOps` 按行展开为 `tile.gather` 循环),`tensor.gather_mask`(掩码模式选择;对应 `tile.gather_mask`,支持可选同位宽 `output_dtype`)
**操作:** `tensor.add/sub/mul/div`(逐元素,支持完整 N 维广播),`tensor.set_validshape`(内部 API,更新 valid_shape 元数据,不搬移数据 — 仅供编译器生成代码使用),`tensor.sort32` / `tensor.mrgsort_format1` / `tensor.mrgsort_format2`(排序;分别对应 `tile.sort32` / `tile.mrgsort` 的 tensor 层接口,由 `ConvertTensorToTileOps` 转换为 tile 操作),`tensor.gather`(按维索引;MVP 仅支持 2D 输入 + `dim=-1`,由 `ConvertTensorToTileOps` 按行展开为 `tile.gather` 循环),`tensor.gather_mask`(掩码模式选择;对应 `tile.gather_mask`,支持可选同位宽 `output_dtype`),`tensor.ci` / `tensor.arange`(生成连续整数序列,dtype ∈ {INT16, INT32},下层降到 `tile.ci`)

**示例:**

Expand Down Expand Up @@ -255,6 +255,7 @@ with ib.function("tensor_example") as f:
| - | `tile.reshape` | 重塑 tile 维度(元素总数须一致) |
| - | `tile.transpose` | 交换 tile 的两个轴 |
| - | `tile.set_validshape` | 更新 valid_shape 元数据,不搬移数据 |
| - | `tile.ci` | 生成连续整数序列(升序 start+k 或降序 start-k);dtype ∈ {INT16, INT32};最内维 != 1 |
| **规约** | `tile.sum` | 沿轴规约(axis, keepdim) |

**数据流:** `TensorType (DDR) → tile.load → TileType (Unified Buffer) → tile.{ops} → TileType → tile.store → TensorType (DDR)`
Expand Down
39 changes: 39 additions & 0 deletions python/pypto/ir/op/tensor_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,45 @@ def full(
return _ir_core.create_op_call("tensor.full", [shape_tuple, value_expr], kwargs, actual_span)


def ci(
    start: int | Expr,
    shape: Sequence[int | Expr] | _ir_core.MakeTuple,
    dtype: DataType = DataType.INT32,
    descending: bool = False,
    span: Span | None = None,
) -> Call:
    """Build a ``tensor.ci`` call that fills a tensor with a contiguous integer sequence.

    Note:
        The op eventually lowers to ``pto.tci``, which only populates the first
        row of the destination. Leading dimensions must be 1 — prefer shapes of
        the form ``[1, N]``.

    Args:
        start: Starting integer (plain int or scalar Expr). Must match ``dtype``.
        shape: Destination shape (leading dims must be 1, innermost dim != 1).
        dtype: Destination dtype. One of {INT16, INT32}.
        descending: If True, generate ``start - k`` instead of ``start + k``.
        span: Optional source span for debugging (auto-captured if not provided).

    Returns:
        Call expression that returns a TensorType.
    """
    actual_span = _get_span_or_capture(span)
    if not isinstance(start, Expr):
        # Plain Python int: wrap it as a constant carrying the destination dtype.
        start_expr: Expr = ConstInt(start, dtype, actual_span)
    elif isinstance(start, ConstInt) and start.dtype != dtype:
        # Integer constant with a mismatched dtype: rebuild it with the destination dtype.
        start_expr = ConstInt(start.value, dtype, actual_span)
    else:
        # Any other scalar Expr is forwarded unchanged.
        start_expr = start
    shape_tuple = _to_make_tuple(shape, actual_span)
    attrs: dict[str, Any] = {"dtype": dtype, "descending": descending}
    return _ir_core.create_op_call("tensor.ci", [start_expr, shape_tuple], attrs, actual_span)


arange = ci


def read(
tensor: Expr, indices: Expr | list[int | Expr] | _ir_core.MakeTuple, span: Span | None = None
) -> Call:
Expand Down
44 changes: 44 additions & 0 deletions python/pypto/ir/op/tile_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,50 @@ def full(
return _ir_core.create_op_call("tile.full", [shape_tuple, value_expr], kwargs, actual_span)


def ci(
    start: int | Expr,
    shape: Sequence[int | Expr] | _ir_core.MakeTuple,
    dtype: DataType = DataType.INT32,
    descending: bool = False,
    span: Span | None = None,
) -> Call:
    """Build a ``tile.ci`` call (pto.tci) generating a contiguous integer sequence.

    For a column index ``k`` in the first row of the destination tile:
    - Ascending: ``dst[0, k] = start + k``
    - Descending: ``dst[0, k] = start - k``

    Note:
        ``pto.tci`` uses the destination's valid-column count as the sequence
        length and does NOT populate additional rows. Leading dimensions must
        be 1 — prefer shapes of the form ``[1, N]``.

    Args:
        start: Starting integer (plain int or a scalar Expr). Its dtype must match ``dtype``.
        shape: Destination tile shape (static, leading dims must be 1, innermost dim != 1).
        dtype: Destination dtype. Must be one of {INT16, INT32}.
        descending: If True, generate a descending sequence.
        span: Optional source span for debugging (auto-captured if not provided).

    Returns:
        Call expression that returns a TileType with the generated sequence.
    """
    actual_span = _get_span_or_capture(span)
    # Wrap a plain int into a constant of the destination dtype; Exprs pass through.
    start_expr = start if isinstance(start, Expr) else ConstInt(start, dtype, actual_span)
    if isinstance(start_expr, ConstInt) and start_expr.dtype != dtype:
        # Re-wrap a mismatched integer constant so its dtype matches the destination.
        start_expr = ConstInt(start_expr.value, dtype, actual_span)
    shape_tuple = _to_make_tuple(shape, actual_span)
    attrs: dict[str, Any] = {"dtype": dtype, "descending": descending}
    return _ir_core.create_op_call("tile.ci", [start_expr, shape_tuple], attrs, actual_span)


arange = ci


def fillpad(tile: Expr, pad_value: PadValue = PadValue.zero, span: Span | None = None) -> Call:
"""Fill remaining tile elements with specified padding value.

Expand Down
29 changes: 29 additions & 0 deletions python/pypto/language/op/tensor_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
"slice",
"fillpad",
"full",
"ci",
"arange",
"matmul",
"matmul_acc",
"mul",
Expand Down Expand Up @@ -233,6 +235,33 @@ def full(shape: Sequence[IntLike], dtype: DataType, value: int | float) -> Tenso
return Tensor(expr=call_expr)


def ci(
    start: int | Scalar,
    shape: Sequence[IntLike],
    dtype: DataType = DataType.INT32,
    descending: bool = False,
) -> Tensor:
    """Generate a contiguous integer sequence into a tensor.

    Equivalent to ``numpy.arange`` / ``torch.arange``. Lowers to ``tile.ci`` → ``pto.tci``.

    Args:
        start: Starting integer (plain int or Scalar). Must match ``dtype``.
        shape: Destination tensor shape (innermost dim != 1).
        dtype: Destination dtype. One of {INT16, INT32}. Defaults to INT32.
        descending: If True, generate a descending sequence.

    Returns:
        Tensor wrapping the ci operation.
    """
    # Unwrap DSL Scalar values down to the underlying IR expression.
    raw_start = start.unwrap() if isinstance(start, Scalar) else start
    return Tensor(
        expr=_ir_ops.ci(raw_start, _normalize_intlike(shape), dtype=dtype, descending=descending)
    )


arange = ci


def matmul(
lhs: Tensor,
rhs: Tensor,
Expand Down
29 changes: 29 additions & 0 deletions python/pypto/language/op/tile_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
"concat",
"move",
"full",
"ci",
"arange",
"fillpad",
"fillpad_inplace",
"get_block_idx",
Expand Down Expand Up @@ -413,6 +415,33 @@ def full(shape: list[int], dtype: DataType, value: int | float) -> Tile:
return Tile(expr=call_expr)


def ci(
    start: int | Scalar,
    shape: Sequence[int],
    dtype: DataType = DataType.INT32,
    descending: bool = False,
) -> Tile:
    """Generate a contiguous integer sequence into a tile.

    Equivalent to ``numpy.arange``-style index generation. Maps to ``pto.tci``.

    Args:
        start: Starting integer (plain int or a Scalar). Must match ``dtype``.
        shape: Shape of the destination tile (static, innermost dim != 1).
        dtype: Destination dtype. One of {INT16, INT32}. Defaults to INT32.
        descending: If True, generate a descending sequence.

    Returns:
        Tile wrapping the ci operation.
    """
    # Unwrap DSL Scalar values down to the underlying IR expression.
    raw_start = start.unwrap() if isinstance(start, Scalar) else start
    return Tile(expr=_ir_ops.ci(raw_start, list(shape), dtype=dtype, descending=descending))


arange = ci


def fillpad(tile: Tile, pad_value: PadValue = PadValue.zero) -> Tile:
"""Fill remaining tile elements with specified padding value.

Expand Down
18 changes: 15 additions & 3 deletions src/backend/common/pto_ops_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -456,13 +456,25 @@ static std::string MakeAssignCodegenPTO(const std::string& pto_op_name, const Ca
// Codegen for the ci op: emits a pto.tci instruction of the form
//   "<pto_op_name> ins(<start> {descending = <bool>} : <type>) outs(<dst> : <type>)".
// Type annotations are appended only when the codegen can produce them.
static std::string MakeCiCodegenPTO(const std::string& pto_op_name, const CallPtr& op,
                                    codegen::CodegenBase& codegen_base) {
  auto& codegen = dynamic_cast<codegen::PTOCodegen&>(codegen_base);
  // The IR op carries (start, shape); shape is consumed by type deduction,
  // so codegen only reads the start operand here.
  CHECK(op->args_.size() == 2) << "Operation:[" << pto_op_name
                               << "] requires 2 arguments (start, shape), but got " << op->args_.size();
  bool descending = op->GetKwarg<bool>("descending");
  std::string src = codegen.GetExprAsCode(op->args_[0]);
  std::string src_type = codegen.GetExprTypeAnnotation(op->args_[0]);
  std::string config_attr = descending ? "{descending = true}" : "{descending = false}";
  std::string dst = codegen.GetCurrentResultTarget();
  std::string dst_type = codegen.GetCurrentResultTileBufTypeString();
  std::ostringstream oss;
  oss << pto_op_name << " ins(" << src << " " << config_attr;
  if (!src_type.empty()) {
    oss << " : " << src_type;
  }
  oss << ") outs(" << dst;
  if (!dst_type.empty()) {
    oss << " : " << dst_type;
  }
  oss << ")";
  codegen.Emit(oss.str());
  return "";
}

Expand Down
83 changes: 83 additions & 0 deletions src/ir/op/tensor_ops/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,89 @@ REGISTER_OP("tensor.full")
return DeduceTensorFullType(args, kwargs);
});

// Type deduction for tensor.ci: validates (start, shape) plus the {dtype, descending}
// attributes and returns the destination TensorType.
TypePtr DeduceTensorCiType(const std::vector<ExprPtr>& args,
                           const std::vector<std::pair<std::string, std::any>>& kwargs) {
  // tensor.ci signature: (start, shape) with attrs {dtype, descending}
  CHECK(args.size() == 2) << "tensor.ci requires exactly 2 arguments (start, shape), but got " << args.size();

  // Required 'dtype' attribute, restricted to the dtypes pto.tci supports.
  bool found_dtype = false;
  DataType dtype;
  for (const auto& [key, value] : kwargs) {
    if (key == "dtype") {
      dtype = AnyCast<DataType>(value, "kwarg key: dtype");
      found_dtype = true;
      break;
    }
  }
  CHECK(found_dtype) << "tensor.ci requires 'dtype' kwarg";
  CHECK(dtype == DataType::INT16 || dtype == DataType::INT32)
      << "tensor.ci dtype must be one of {INT16, INT32}, but got " << dtype.ToString();

  // First arg: start scalar; dtype must match destination dtype.
  auto start_scalar_type = As<ScalarType>(args[0]->GetType());
  CHECK(start_scalar_type) << "tensor.ci requires first argument 'start' to be a scalar, but got "
                           << args[0]->GetType()->TypeName();
  CHECK(start_scalar_type->dtype_ == dtype)
      << "tensor.ci 'start' dtype (" << start_scalar_type->dtype_.ToString()
      << ") must match destination dtype (" << dtype.ToString() << ")";

  // Second arg: shape TupleType whose elements are all integer scalars.
  auto shape_tuple_type = As<TupleType>(args[1]->GetType());
  CHECK(shape_tuple_type) << "tensor.ci requires shape to be TupleType, but got "
                          << args[1]->GetType()->TypeName();

  for (size_t i = 0; i < shape_tuple_type->types_.size(); ++i) {
    auto scalar_type = As<ScalarType>(shape_tuple_type->types_[i]);
    CHECK(scalar_type) << "tensor.ci shape element " << i << " must be ScalarType, but got "
                       << shape_tuple_type->types_[i]->TypeName();
    CHECK(scalar_type->dtype_.IsInt())
        << "tensor.ci shape element " << i << " must have integer dtype, but got "
        << scalar_type->dtype_.ToString();
  }

  // Materialize shape element exprs: reuse MakeTuple elements directly, otherwise
  // index the tuple expr with TupleGetItemExpr per dimension.
  std::vector<ExprPtr> shape;
  shape.reserve(shape_tuple_type->types_.size());
  if (auto make_tuple = As<MakeTuple>(args[1])) {
    shape = make_tuple->elements_;
  } else {
    for (size_t i = 0; i < shape_tuple_type->types_.size(); ++i) {
      shape.emplace_back(std::make_shared<TupleGetItemExpr>(args[1], static_cast<int>(i), args[1]->span_));
    }
  }
  CHECK(!shape.empty()) << "tensor.ci requires non-empty shape";

  // ISA constraint: innermost dim Cols != 1.
  if (auto last_const = As<ConstInt>(shape.back())) {
    CHECK(last_const->value_ != 1) << "tensor.ci requires the innermost dimension (Cols) to be != 1, got "
                                   << last_const->value_;
  }

  // ISA constraint: pto.tci only populates the first row. Reject multi-row compile-time
  // shapes so tensor.ci metadata stays consistent with the tile.ci lowering.
  for (size_t i = 0; i + 1 < shape.size(); ++i) {
    if (auto const_dim = As<ConstInt>(shape[i])) {
      CHECK(const_dim->value_ == 1)
          << "tensor.ci only populates the first row because pto.tci ignores valid rows; "
          << "leading dimensions must be 1, but got " << const_dim->value_ << " at index " << i;
    }
  }

  // 'descending' is an optional bool kwarg; no validation needed beyond its type.
  return std::make_shared<TensorType>(shape, dtype);
}

// Register tensor.ci: tensor-level contiguous-integer-sequence generator.
// Converted to tile.ci (which maps to pto.tci) by ConvertTensorToTileOps;
// type deduction and all argument/attr validation live in DeduceTensorCiType.
REGISTER_OP("tensor.ci")
.set_op_category("TensorOp")
.set_description("Generate a contiguous integer sequence into a tensor (lowers to tile.ci)")
.add_argument("start", "Starting integer scalar (must match dst dtype)")
.add_argument("shape", "Destination shape (TupleType of ScalarType integer)")
.set_attr<DataType>("dtype")  // required; one of {INT16, INT32}
.set_attr<bool>("descending")  // optional; ascending when false
.f_deduce_type([](const std::vector<ExprPtr>& args,
const std::vector<std::pair<std::string, std::any>>& kwargs) {
return DeduceTensorCiType(args, kwargs);
});

TypePtr DeduceTensorDimType(const std::vector<ExprPtr>& args,
const std::vector<std::pair<std::string, std::any>>& kwargs) {
// tensor.dim: Extract a shape dimension from a tensor as a scalar
Expand Down
Loading
Loading