diff --git a/.claude/rules/pass-doc-ordering.md b/.claude/rules/pass-doc-ordering.md index 48ac9c92d..89d39049f 100644 --- a/.claude/rules/pass-doc-ordering.md +++ b/.claude/rules/pass-doc-ordering.md @@ -17,19 +17,18 @@ Developers read pass docs sequentially to understand the compilation pipeline. I | 02 | `02-ctrl_flow_transform.md` | 2nd pass | | 03 | `03-convert_to_ssa.md` | 3rd pass | | 04 | `04-flatten_call_expr.md` | 4th pass | -| 05 | `05-split_chunked_loops.md` | 5th pass | -| 06 | `06-interchange_chunk_loops.md` | 6th pass | -| 07 | `07-outline_incore_scopes.md` | 7th pass | -| 08 | `08-outline_cluster_scopes.md` | 8th pass | -| 09 | `09-convert_tensor_to_tile_ops.md` | 9th pass | -| 10 | `10-optimize_orch_tensors.md` | 10th pass | -| 11 | `11-flatten_tile_nd_to_2d.md` | 11th pass | -| 12 | *(no doc yet)* | 12th pass (`InferTileMemorySpace`) | -| 13 | *(no doc yet)* | 13th pass (`ResolveTransposeLayout`) | -| 14 | `14-expand_mixed_kernel.md` | 14th pass | -| 15 | `15-init_memref.md` | 15th pass | -| 16 | `16-memory_reuse.md` | 16th pass | -| 17 | `17-allocate_memory_addr.md` | 17th pass | +| 05 | `05-outline_hierarchy_scopes.md` | 5th pass (non-CORE_GROUP → `Opaque`) | +| 06 | `06-outline_incore_scopes.md` | 6th pass (CORE_GROUP → `InCore`, promote parent) | +| 07 | `07-outline_cluster_scopes.md` | 7th pass | +| 08 | `08-convert_tensor_to_tile_ops.md` | 8th pass | +| 09 | `09-optimize_orch_tensors.md` | 9th pass | +| 10 | `10-flatten_tile_nd_to_2d.md` | 10th pass | +| 11 | `11-expand_mixed_kernel.md` | 11th pass | +| 12 | `12-init_memref.md` | 12th pass | +| 13 | `13-memory_reuse.md` | 13th pass | +| 14 | `14-allocate_memory_addr.md` | 14th pass | +| 15 | `15-partial_unroll_tile_loops.md` | 15th pass | +| 16 | `16-reorder_unrolled_io.md` | 16th pass | | 90 | `90-insert_sync.md` | Not in Default strategy | | 91 | `91-utility_passes.md` | Not in Default strategy | | 99 | `99-verifier.md` | Infrastructure (not a pipeline pass) | diff --git a/CMakeLists.txt 
b/CMakeLists.txt index b92a4a01f..27582902b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -147,9 +147,9 @@ set(PYPTO_SOURCES src/ir/transforms/mutator.cpp src/ir/transforms/normalize_stmt_structure_pass.cpp src/ir/transforms/op_conversion_registry.cpp - src/ir/transforms/outline_incore_scopes_pass.cpp src/ir/transforms/outline_cluster_scopes_pass.cpp src/ir/transforms/outline_hierarchy_scopes_pass.cpp + src/ir/transforms/outline_incore_scopes_pass.cpp src/ir/transforms/expand_mixed_kernel_pass.cpp src/ir/transforms/split_vector_kernel_pass.cpp src/ir/transforms/flatten_tile_nd_to_2d_pass.cpp @@ -159,8 +159,6 @@ set(PYPTO_SOURCES src/ir/transforms/resolve_transpose_layout_pass.cpp src/ir/transforms/python_printer.cpp src/ir/transforms/simplify_pass.cpp - src/ir/transforms/split_chunked_loops_pass.cpp - src/ir/transforms/interchange_chunk_loops_pass.cpp src/ir/transforms/unroll_loops_pass.cpp src/ir/transforms/partial_unroll_tile_loops_pass.cpp src/ir/transforms/reorder_unrolled_io_pass.cpp diff --git a/docs/en/dev/ir/01-hierarchy.md b/docs/en/dev/ir/01-hierarchy.md index 18662121f..a09c7525d 100644 --- a/docs/en/dev/ir/01-hierarchy.md +++ b/docs/en/dev/ir/01-hierarchy.md @@ -32,7 +32,12 @@ This document provides a complete reference of all IR node types, organized by c ::= "return" [ ] ::= ::= { ";" } - ::= "with" "pl.incore" "(" ")" ":" + ::= "with" "pl.at" "(" "level" "=" [ "," "role" "=" ] + [ "," "optimizations" "=" "[" "]" ] ")" + ":" + | "with" "pl.cluster" "(" ")" ":" + | "with" "pl.spmd" "(" "core_num" "=" + [ "," "sync_start" "=" ] ")" ":" ::= "break" ::= "continue" @@ -153,10 +158,8 @@ field from the `Stmt` base class. 
See [Leading comments on statements](#leading- | **IfStmt** | `condition_`, `then_stmts_`, `else_stmts_`, `return_vars_` | Conditional branching | | **ForStmt** | `loop_var_` (DefField), `start_`, `stop_`, `step_`, `iter_args_` (DefField), `body_`, `return_vars_` (DefField), `kind_` | For loop with optional iteration args | | **WhileStmt** | `condition_`, `iter_args_` (DefField), `body_`, `return_vars_` (DefField) | While loop with condition and iteration args | -| **InCoreScopeStmt** | `name_hint_`, `body_`, `split_` (optional) | InCore region; outlined to `Function(InCore)` | -| **AutoInCoreScopeStmt** | `name_hint_`, `body_`, `split_` (optional) | Auto-InCore region; consumed by `InterchangeChunkLoops` | | **ClusterScopeStmt** | `name_hint_`, `body_` | Cluster region; outlined to `Function(Group)` | -| **HierarchyScopeStmt** | `name_hint_`, `body_`, `level_`, `role_` (optional) | Pipeline-stage region for a given Level/Role | +| **HierarchyScopeStmt** | `name_hint_`, `body_`, `level_`, `role_` (optional), `split_` (optional) | Pipeline-stage region for a given Level/Role; outlined to `Function(InCore)` when `level_ == CORE_GROUP` and to `Function(Opaque)` otherwise | | **SpmdScopeStmt** | `name_hint_`, `body_`, `core_num_`, `sync_start_` | SPMD launch region; outlined to `Function(Spmd)` | | **YieldStmt** | `values_` | Yield values in loop iteration | | **EvalStmt** | `expr_` | Evaluate expression for side effects | @@ -252,25 +255,22 @@ while_stmt = ir.WhileStmt(condition, [x_iter], body, [x_final], span) ### ScopeStmt Details `ScopeStmt` is an **abstract base class** that marks a region with a specific -execution context. The five concrete subclasses below each carry only the +execution context. The three concrete subclasses below each carry only the fields valid for their kind — invalid combinations are unrepresentable at construction. 
Use `s.scope_kind` (or `s.GetScopeKind()` in C++) to recover the -kind from a `ScopeStmt`-typed reference, or `isinstance(s, InCoreScopeStmt)` +kind from a `ScopeStmt`-typed reference, or `isinstance(s, HierarchyScopeStmt)` to dispatch on the concrete type. -All five share the common base fields `name_hint_: str` and `body_: StmtPtr`. -Note that `pl.at(level=Level.CORE_GROUP)` lowers to `InCoreScopeStmt` / -`AutoInCoreScopeStmt`, not `HierarchyScopeStmt` — the parser rejects `role=` -at `CORE_GROUP`. `HierarchyScopeStmt` is reserved for non-`CORE_GROUP` levels -(host, cluster, global) and is not a general replacement for in-core scopes. +All three share the common base fields `name_hint_: str` and `body_: StmtPtr`. +`pl.at(level=...)` always lowers to `HierarchyScopeStmt` — including the +`level=Level.CORE_GROUP` form, which produces a `HierarchyScopeStmt` with +`level_ == CORE_GROUP` and an optional `split_`. `OutlineIncoreScopes` +later turns that `CORE_GROUP` scope into a `Function(InCore)` and re-types +the parent `Opaque` function as `Orchestration`. Non-`CORE_GROUP` +`HierarchyScopeStmt`s are outlined into `Function(Opaque)` by +`OutlineHierarchyScopes` (which runs immediately before `OutlineIncoreScopes`). 
```python -# with pl.incore(): y = pl.add(x, x) -in_core = ir.InCoreScopeStmt(name_hint="", body=body, span=span) - -# with pl.auto_incore(): (split is optional) -auto = ir.AutoInCoreScopeStmt(name_hint="", body=body, span=span) - # with pl.cluster(): cluster = ir.ClusterScopeStmt(name_hint="", body=body, span=span) @@ -278,6 +278,12 @@ cluster = ir.ClusterScopeStmt(name_hint="", body=body, span=span) hier = ir.HierarchyScopeStmt(level=ir.Level.HOST, role=ir.Role.Worker, name_hint="", body=body, span=span) +# with pl.at(level=Level.CORE_GROUP, +# optimizations=[pl.split(pl.SplitMode.UP_DOWN)]): +hier_core = ir.HierarchyScopeStmt(level=ir.Level.CORE_GROUP, + split=ir.SplitMode.UP_DOWN, + name_hint="", body=body, span=span) + # with pl.spmd(core_num=8): spmd = ir.SpmdScopeStmt(core_num=8, sync_start=False, name_hint="", body=body, span=span) @@ -289,20 +295,33 @@ spmd = ir.SpmdScopeStmt(core_num=8, sync_start=False, are not control flow (execute once, linearly). - Required fields are enforced at construction: `HierarchyScopeStmt.level_` is non-optional; `SpmdScopeStmt` rejects `core_num <= 0`. -- `InCoreScopeStmt` / `AutoInCoreScopeStmt` are scheduled for deprecation; - prefer `HierarchyScopeStmt` or other surviving kinds in new code. +- `HierarchyScopeStmt.split_` is optional and is only meaningful at + `Level.CORE_GROUP`. It is copied onto the outlined `InCore` function's + attrs so `ExpandMixedKernel` can read the hint. - Pass behavior: - - `InterchangeChunkLoops` consumes `AutoInCoreScopeStmt` - - `OutlineIncoreScopes` extracts `InCoreScopeStmt` into `Function(InCore)` + - `OutlineHierarchyScopes` extracts every non-`CORE_GROUP` + `HierarchyScopeStmt` into a dedicated `FunctionType::Opaque` function. + Parent function types are preserved. + - `OutlineIncoreScopes` (runs immediately after) extracts every + `CORE_GROUP` `HierarchyScopeStmt` into a dedicated `FunctionType::InCore` + function. 
Parents that contained at least one `CORE_GROUP` scope are + re-typed from `Opaque` to `Orchestration`. - `OutlineClusterScopes` extracts `ClusterScopeStmt` into `Function(Group)` - and standalone `SpmdScopeStmt` into `Function(Spmd)` - - `OutlineHierarchyScopes` extracts `HierarchyScopeStmt` + and standalone `SpmdScopeStmt` into `Function(Spmd)`. **Transformation:** ```python -# Before: with pl.incore(): y = pl.add(x, x); return y -# After: main_incore_0(x) -> y; main(x): y = main_incore_0(x); return y +# Before: +# def main(x): +# with pl.at(level=pl.Level.CORE_GROUP): +# y = pl.add(x, x) +# return y +# After: +# def main_core_group_0(x) -> y: ... # FunctionType.InCore +# def main(x) -> y: # FunctionType.Orchestration +# y = main_core_group_0(x) +# return y ``` **Parallel for loop (ForKind):** @@ -444,7 +463,7 @@ Functions stored in sorted map for deterministic ordering. GlobalVar names must | **Unary Ops** | 5 | Abs, Neg, Not, BitNot, Cast | | **Call/Access** | 2 | Call, TupleGetItemExpr | | **Operations** | 2 | Op, GlobalVar | -| **Statements** | 15 | AssignStmt, IfStmt, ForStmt, WhileStmt, ReturnStmt, InCoreScopeStmt, AutoInCoreScopeStmt, ClusterScopeStmt, HierarchyScopeStmt, SpmdScopeStmt, YieldStmt, EvalStmt, SeqStmts, BreakStmt, ContinueStmt | +| **Statements** | 13 | AssignStmt, IfStmt, ForStmt, WhileStmt, ReturnStmt, ClusterScopeStmt, HierarchyScopeStmt, SpmdScopeStmt, YieldStmt, EvalStmt, SeqStmts, BreakStmt, ContinueStmt | | **Types** | 6 | ScalarType, TensorType, TileType, TupleType, PipeType, UnknownType | | **Functions** | 2 | Function, Program | diff --git a/docs/en/dev/language/00-python_syntax.md b/docs/en/dev/language/00-python_syntax.md index 6a4033aed..5a882bca2 100644 --- a/docs/en/dev/language/00-python_syntax.md +++ b/docs/en/dev/language/00-python_syntax.md @@ -256,22 +256,17 @@ for i in pl.unroll(12, chunk=4): body_statements ``` -**Key points:** `chunk=C` splits the loop into an outer sequential loop and an inner loop of `C` iterations. 
The inner loop preserves the original kind (Sequential/Parallel/Unroll). `chunk` cannot be combined with `init_values`, and `chunk=` loops are only valid inside a `with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk]):` — outside that scope the parser rejects them with an error. See [SplitChunkedLoops Pass](../passes/05-split_chunked_loops.md). +**Key points:** `chunk=C` splits the loop into an outer sequential loop and an inner loop of `C` iterations. The inner loop preserves the original kind (Sequential/Parallel/Unroll). `chunk` cannot be combined with `init_values`. ### Scope Context Managers -| Form | Scope Kind | Notes | -| ---- | ---------- | ----- | -| `pl.at(level=pl.Level.CORE_GROUP)` | `InCore` | Fixed-boundary outline at CORE_GROUP | -| `pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.split(MODE)])` | `InCore` | InCore + cross-core split hint | -| `pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk])` | `AutoInCore` | Compiler-driven chunked loop split | -| `pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk, pl.split(MODE)])` | `AutoInCore` | AutoInCore + split hint (independent entries) | -| `pl.at(level=pl.Level.HOST)` *(or any non-`CORE_GROUP` level)* | `Hierarchy` | Distributed hierarchy scope | -| `pl.cluster()` | `Cluster` | Co-scheduled AIC+AIV group | -| `pl.incore()` *(deprecated)* | `InCore` | Use `pl.at(level=pl.Level.CORE_GROUP)` instead | -| `pl.auto_incore(split=...)` *(deprecated)* | `AutoInCore` | Use `pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk, pl.split(...)])` | -| `pl.at(..., optimization=pl.chunked_loop_optimizer[(split=...)])` *(deprecated)* | `AutoInCore` | Use `pl.at(..., optimizations=[pl.auto_chunk, pl.split(...)])` | -| `pl.at(..., split=...)` *(deprecated)* | `InCore` | Use `pl.at(..., optimizations=[pl.split(...)])` | +| Form | Produces | Notes | +| ---- | -------- | ----- | +| `pl.at(level=pl.Level.CORE_GROUP)` | `HierarchyScopeStmt` (level=CORE_GROUP) | Outlined to 
`Function(InCore)` by `OutlineIncoreScopes`; parent `Opaque` is promoted to `Orchestration` | +| `pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.split(MODE)])` | `HierarchyScopeStmt` (level=CORE_GROUP, split=MODE) | Same as above; the split hint is carried on the outlined function and consumed by `ExpandMixedKernel` | +| `pl.at(level=pl.Level.HOST)` *(or any non-`CORE_GROUP` level)* | `HierarchyScopeStmt` (level=HOST/...) | Outlined to `Function(Opaque)` by `OutlineHierarchyScopes`; parent type preserved | +| `pl.cluster()` | `ClusterScopeStmt` | Outlined to `Function(Group)` by `OutlineClusterScopes` | +| `pl.spmd(core_num=N[, sync_start=...])` | `SpmdScopeStmt` | Standalone (non-cluster) spmd is outlined to `Function(Spmd)`; inside a cluster the attrs are hoisted onto the Group function | See [Language Guide](../../user/01-language_guide.md#incore-scopes) for examples. diff --git a/docs/en/dev/passes/00-pass_manager.md b/docs/en/dev/passes/00-pass_manager.md index fec4736b3..73851d4db 100644 --- a/docs/en/dev/passes/00-pass_manager.md +++ b/docs/en/dev/passes/00-pass_manager.md @@ -33,7 +33,7 @@ Framework for organizing and executing IR transformation passes on Programs with | `NoNestedCalls` | No nested call expressions | | `NormalizedStmtStructure` | Statement structure normalized | | `NoRedundantBlocks` | No single-child or nested SeqStmts | -| `SplitIncoreOrch` | InCore scopes outlined into separate functions | +| `HierarchyOutlined` | `HierarchyScopeStmt` regions outlined into functions (`Opaque` for non-CORE_GROUP via `OutlineHierarchyScopes`; `InCore` for `CORE_GROUP` via `OutlineIncoreScopes`); parent re-typed as `Orchestration` when a `CORE_GROUP` scope was outlined. Produced by `OutlineIncoreScopes` (the second of the two outline passes). 
| | `ClusterOutlined` | Cluster scopes outlined into Group functions | | `HasMemRefs` | MemRef objects initialized on variables | | `IncoreTileOps` | InCore functions use tile ops | @@ -61,21 +61,20 @@ struct PassProperties { | UnrollLoops | TypeChecked | TypeChecked | — | | CtrlFlowTransform | TypeChecked | TypeChecked, StructuredCtrlFlow | — | | ConvertToSSA | TypeChecked | TypeChecked, SSAForm | NormalizedStmtStructure | -| FlattenCallExpr | SSAForm | SSAForm, NoNestedCalls | NormalizedStmtStructure | -| SplitChunkedLoops | TypeChecked, SSAForm | TypeChecked, SSAForm | — | -| InterchangeChunkLoops | TypeChecked, SSAForm | TypeChecked, SSAForm | — | | NormalizeStmtStructure | TypeChecked | TypeChecked, NormalizedStmtStructure | — | -| OutlineIncoreScopes | TypeChecked, SSAForm | SplitIncoreOrch | — | +| FlattenCallExpr | SSAForm | SSAForm, NoNestedCalls | NormalizedStmtStructure | +| OutlineHierarchyScopes | SSAForm | SSAForm | — | +| OutlineIncoreScopes | SSAForm | SSAForm, HierarchyOutlined | — | | OutlineClusterScopes | TypeChecked, SSAForm | ClusterOutlined | — | -| ConvertTensorToTileOps | SplitIncoreOrch | IncoreTileOps | — | +| ConvertTensorToTileOps | HierarchyOutlined | IncoreTileOps | — | | FlattenTileNdTo2D | SSAForm, IncoreTileOps | SSAForm, TileOps2D | — | -| ResolveBackendOpLayouts | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D | NormalizedStmtStructure | -| ExpandMixedKernel | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D | SSAForm, MixedKernelExpanded | — | -| NormalizeReturnOrder | SplitIncoreOrch, IncoreTileOps | — | — | -| InitMemRef | TypeChecked, SSAForm, SplitIncoreOrch, IncoreTileOps, TileOps2D | HasMemRefs | SSAForm | -| MemoryReuse | TypeChecked, SplitIncoreOrch, IncoreTileOps, HasMemRefs, TileOps2D | — | — | -| InsertSync | TypeChecked, SplitIncoreOrch, IncoreTileOps, HasMemRefs, TileOps2D | — | — | -| AllocateMemoryAddr | TypeChecked, SplitIncoreOrch, IncoreTileOps, 
HasMemRefs, TileOps2D | AllocatedMemoryAddr | — | +| ResolveBackendOpLayouts | SSAForm, IncoreTileOps, HierarchyOutlined, TileOps2D | SSAForm, IncoreTileOps, HierarchyOutlined, TileOps2D | NormalizedStmtStructure | +| ExpandMixedKernel | SSAForm, IncoreTileOps, HierarchyOutlined, TileOps2D | SSAForm, MixedKernelExpanded | — | +| NormalizeReturnOrder | HierarchyOutlined, IncoreTileOps | — | — | +| InitMemRef | TypeChecked, SSAForm, HierarchyOutlined, IncoreTileOps, TileOps2D | HasMemRefs | SSAForm | +| MemoryReuse | TypeChecked, HierarchyOutlined, IncoreTileOps, HasMemRefs, TileOps2D | — | — | +| InsertSync | TypeChecked, HierarchyOutlined, IncoreTileOps, HasMemRefs, TileOps2D | — | — | +| AllocateMemoryAddr | TypeChecked, HierarchyOutlined, IncoreTileOps, HasMemRefs, TileOps2D | AllocatedMemoryAddr | — | | FuseCreateAssembleToSlice | — | — | — | | Simplify | — | — | — | diff --git a/docs/en/dev/passes/01-unroll_loops.md b/docs/en/dev/passes/01-unroll_loops.md index cfe582659..8f994c3e5 100644 --- a/docs/en/dev/passes/01-unroll_loops.md +++ b/docs/en/dev/passes/01-unroll_loops.md @@ -77,10 +77,10 @@ class After: UnrollLoops runs **once** in `Default` and `DebugTileOptimization`, before control flow structuring: ```text -UnrollLoops → CtrlFlowTransform → ConvertToSSA → FlattenCallExpr → SplitChunkedLoops → InterchangeChunkLoops → OutlineIncoreScopes → ... +UnrollLoops → CtrlFlowTransform → ConvertToSSA → NormalizeStmtStructure → FlattenCallExpr → OutlineHierarchyScopes → OutlineIncoreScopes → OutlineClusterScopes → ... ``` -UnrollLoops expands non-chunked `pl.unroll()` loops (skipping chunked unroll loops which retain `chunk` for `SplitChunkedLoops` to handle later). +UnrollLoops expands `pl.unroll()` loops into their inline body copies. 
## Pass Properties diff --git a/docs/en/dev/passes/02-ctrl_flow_transform.md b/docs/en/dev/passes/02-ctrl_flow_transform.md index 4c097a185..4ebf2571f 100644 --- a/docs/en/dev/passes/02-ctrl_flow_transform.md +++ b/docs/en/dev/passes/02-ctrl_flow_transform.md @@ -163,7 +163,7 @@ while i < n and not __break_0: CtrlFlowTransform runs after UnrollLoops and before ConvertToSSA: ```text -UnrollLoops -> CtrlFlowTransform -> ConvertToSSA -> FlattenCallExpr -> SplitChunkedLoops -> ... +UnrollLoops -> CtrlFlowTransform -> ConvertToSSA -> NormalizeStmtStructure -> FlattenCallExpr -> OutlineHierarchyScopes -> ... ``` ## Pass Properties diff --git a/docs/en/dev/passes/03-convert_to_ssa.md b/docs/en/dev/passes/03-convert_to_ssa.md index da0db1c03..2a52dd733 100644 --- a/docs/en/dev/passes/03-convert_to_ssa.md +++ b/docs/en/dev/passes/03-convert_to_ssa.md @@ -13,7 +13,7 @@ This pass transforms IR with multiple assignments to the same variable into SSA **Requires**: `TypeChecked` property. `TypeChecked` is verified automatically at BASIC level once produced; use a `VerificationInstrument` via `PassContext` to validate required properties before this pass runs. -**When to use**: Run this pass before any optimization or analysis that requires SSA form (e.g., OutlineIncoreScopes, memory optimization passes). +**When to use**: Run this pass before any optimization or analysis that requires SSA form (e.g., OutlineHierarchyScopes, memory optimization passes). ## API diff --git a/docs/en/dev/passes/05-outline_hierarchy_scopes.md b/docs/en/dev/passes/05-outline_hierarchy_scopes.md new file mode 100644 index 000000000..48b97ca31 --- /dev/null +++ b/docs/en/dev/passes/05-outline_hierarchy_scopes.md @@ -0,0 +1,197 @@ +# OutlineHierarchyScopes Pass + +Outlines non-`CORE_GROUP` `HierarchyScopeStmt` regions into separate +`Opaque` functions, carrying the scope's level/role metadata onto the +outlined function. 
+ +## Overview + +This pass transforms each `HierarchyScopeStmt` whose `level_` is not +`Level.CORE_GROUP` into a dedicated `Function` definition and replaces the +scope with a `Call` to that function. The outlined function is always typed +`FunctionType::Opaque`; the parent function's type is preserved. + +| Scope `level_` | Handled by this pass | Outlined function type | Parent function type after pass | +| -------------- | -------------------- | ---------------------- | ------------------------------- | +| `Level.HOST`, `Level.CLUSTER`, `Level.GLOBAL`, ... | Yes | `FunctionType::Opaque` | unchanged (preserved) | +| `Level.CORE_GROUP` | **No — intentionally left alone** | *(handled by [`OutlineIncoreScopes`](06-outline_incore_scopes.md))* | *(promoted to `Orchestration` by the next pass)* | + +`CORE_GROUP` scopes are intentionally left untouched here; the immediately +following pass, [`OutlineIncoreScopes`](06-outline_incore_scopes.md), +outlines them into `Function(InCore)` and promotes the parent function from +`Opaque` to `Orchestration`. + +**Requirements**: + +- Input IR must be in SSA form (run `ConvertToSSA` first). SSA form is + preserved (produced) by this pass. +- Processes `Opaque` functions. Functions already typed as + `Orchestration`, `InCore`, `AIC`, `AIV`, or `Group` are left untouched. + +**When to use**: Run after `ConvertToSSA`/`FlattenCallExpr` when the IR +contains `with pl.at(level=...):` scopes for non-`CORE_GROUP` levels that +need to be extracted into callable helper functions. + +## API + +| C++ | Python | Level | +| --- | ------ | ----- | +| `pass::OutlineHierarchyScopes()` | `passes.outline_hierarchy_scopes()` | Program-level | + +**Factory function**: + +```cpp +Pass OutlineHierarchyScopes(); +``` + +**Python usage**: + +```python +from pypto.pypto_core import passes + +outline_pass = passes.outline_hierarchy_scopes() +program_outlined = outline_pass(program) +``` + +## Algorithm + +1. 
**Scan for Hierarchy Scopes**: Find every `HierarchyScopeStmt` inside each + `Opaque` function body whose `level_` is **not** `CORE_GROUP`. +2. **Analyze Inputs/Outputs**: Use the shared scope-outline helpers to compute + the set of variables defined outside but used inside (inputs) and defined + inside but used outside (outputs). +3. **Create Outlined Function**: Extract the scope body into a new `Function`: + - Parameters = input variables + - Returns = output variables + - Body = the scope body + - `func_type_` = `Opaque` + - Copy `role_` metadata into function attrs. +4. **Replace the Scope**: Substitute the original `HierarchyScopeStmt` with + a `Call` to the outlined function followed by `AssignStmt`s that bind its + return values. +5. **Preserve Parent Type**: The parent function's `func_type_` is not + changed by this pass. Parent type promotion for `CORE_GROUP` scopes is + the responsibility of [`OutlineIncoreScopes`](06-outline_incore_scopes.md). +6. **Add to Program**: Prepend the outlined functions to the program's + function list. + +**Naming**: `{original_func}_{level}_{counter}` (e.g. `main_host_0`, +`main_global_0`). When `HierarchyScopeStmt.name_hint` is non-empty the hint +is used directly. 
+ +## Example + +### Non-CORE_GROUP level (HOST) + +**Before**: + +```python +@pl.program +class Before: + @pl.function # Opaque + def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + with pl.at(level=pl.Level.HOST): + y = helper(x) + return y +``` + +**After** (parent stays `Opaque`, outlined function is `Opaque`): + +```python +@pl.program +class After: + @pl.function # unchanged + def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + y = self.main_host_0(x) + return y + + @pl.function # Opaque + def main_host_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + y = helper(x) + return y +``` + +### Multiple outputs + +```python +with pl.at(level=pl.Level.HOST): + a_tile = pl.load(a, [0], [64]) + b_tile = pl.load(b, [0], [64]) + c_tile = pl.add(a_tile, b_tile) + out_a = pl.store(c_tile, [0], out) + out_b = pl.mul(c_tile, 2.0) +# both out_a and out_b used after the scope +x = out_a + out_b +``` + +After outlining, the body becomes: + +```python +out_a, out_b = self.main_host_0(a, b, out) # multiple return values +x = out_a + out_b +``` + +### CORE_GROUP scopes are skipped + +```python +@pl.function # Opaque +def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + with pl.at(level=pl.Level.CORE_GROUP): # <-- NOT outlined here + tile = pl.load(x, [0], [64]) + result = pl.store(tile, [0], x) + return result +``` + +This pass leaves the `CORE_GROUP` scope in place. The next pipeline pass, +[`OutlineIncoreScopes`](06-outline_incore_scopes.md), outlines it into +`Function(InCore)` and promotes the parent to `Orchestration`. 
+ +## Implementation + +**Header**: `include/pypto/ir/transforms/passes.h` + +```cpp +Pass OutlineHierarchyScopes(); +``` + +**Implementation**: `src/ir/transforms/outline_hierarchy_scopes_pass.cpp` + +- Uses the shared `scope_outline_utils` to compute inputs/outputs +- Builds a new `Function(Opaque)` per non-`CORE_GROUP` scope +- Copies `role_` metadata onto the outlined function's attrs +- Never modifies the parent function's `func_type_` + +**Python binding**: `python/bindings/modules/passes.cpp` + +```cpp +passes.def("outline_hierarchy_scopes", &pass::OutlineHierarchyScopes, + "Outline non-CORE_GROUP HierarchyScopeStmt regions into Opaque functions"); +``` + +**Tests**: `tests/ut/ir/transforms/test_outline_hierarchy_scopes.py` + +- Tests non-`CORE_GROUP` scope → `Opaque` function + parent unchanged +- Tests that `CORE_GROUP` scopes are left in place +- Tests input/output analysis +- Tests multiple non-`CORE_GROUP` scopes in the same parent function +- Tests SSA preservation + +## Pass Properties + +| Property | Value | +| -------- | ----- | +| Required | `SSAForm` | +| Produced | `SSAForm` | +| Invalidated | — | + +`HierarchyOutlined` is produced by +[`OutlineIncoreScopes`](06-outline_incore_scopes.md), which runs next and +handles the remaining `CORE_GROUP` scopes. + +## Pipeline Position + +```text +... → ConvertToSSA → NormalizeStmtStructure → FlattenCallExpr → +OutlineHierarchyScopes → OutlineIncoreScopes → OutlineClusterScopes → +ConvertTensorToTileOps → ... +``` diff --git a/docs/en/dev/passes/05-split_chunked_loops.md b/docs/en/dev/passes/05-split_chunked_loops.md deleted file mode 100644 index 713f1fa4b..000000000 --- a/docs/en/dev/passes/05-split_chunked_loops.md +++ /dev/null @@ -1,187 +0,0 @@ -# SplitChunkedLoops Pass - -Splits loops with `chunk` into nested outer/inner loops under one of two policies. 
- -## Overview - -This pass transforms a for loop created with `chunk=C` into a pair of nested loops: an outer loop over chunk indices and an inner loop iterating within each chunk. Two codegen policies are supported: - -- **`guarded`** (default) — emit a single outer loop of `ceil(T/C)` chunks plus an inner loop of `C`, and wrap the body in `if (idx < stop)` (or `idx > stop` for negative step). Out-of-range iterations become no-ops. A single kernel is emitted. -- **`leading_full`** — emit a full-chunk loop of `T/C` chunks plus a separate remainder loop of `T % C` iterations. Two sibling loops are emitted. - -Both policies run after SSA conversion and propagate `iter_args` through the generated loops. - -**Requires**: `TypeChecked`, `SSAForm`. - -**When to use**: Runs automatically in the default pipeline after `FlattenCallExpr` and before `InterchangeChunkLoops`. Use `chunk=` on `pl.range()`, `pl.parallel()`, or `pl.unroll()` inside a `with pl.auto_incore():` scope. Chunked loops outside `auto_incore` are not split. 
- -## API - -| C++ | Python | Level | -| --- | ------ | ----- | -| `pass::SplitChunkedLoops()` | `passes.split_chunked_loops()` | Function-level | - -```python -from pypto import passes -result = passes.split_chunked_loops()(program) -``` - -## DSL Syntax - -Chunked loops must be wrapped in `with pl.auto_incore():`: - -```python -with pl.auto_incore(): - # Default (guarded): single kernel with if-guard - for i in pl.range(10, chunk=5): - x = pl.add(x, 1.0) - - # Explicit guarded (same as default) - for i in pl.parallel(n, chunk=4, chunk_policy="guarded"): - x = pl.add(x, 1.0) - - # Explicit leading_full: peels remainder into separate loop - for i in pl.range(7, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - - # iter_args are supported under both policies - for i, (s,) in pl.range(10, init_values=(x,), chunk=5): - s = pl.add(s, 1.0) - s = pl.yield_(s) -``` - -## Choosing a Policy - -| Criterion | Prefer `guarded` | Prefer `leading_full` | -| --------- | ---------------- | --------------------- | -| Dynamic bound (`stop` not a compile-time constant) | ✅ — single kernel preserves loop-carried state across the boundary | ❌ — remainder kernel receives iter_args as input-only copies, breaking cross-iteration accumulation | -| Static bound, trip_count known divisible | Slightly redundant guard | ✅ — no guard, no remainder | -| Want minimum kernel count under `pl.auto_incore()` | ✅ | Produces 2 kernels per chunked loop | -| Want to eliminate masked iterations inside the hot loop | ❌ | ✅ — full chunks run unconditionally | - -`guarded` is the default because (1) it preserves `add_inout()` accumulation under dynamic bounds and (2) it avoids doubling the kernel count under `pl.auto_incore()`. 
- -## Constraints - -| Constraint | Reason | -| ---------- | ------ | -| `step`, `chunk` must be integer constants | Needed at compile time | -| `chunk` must be a positive integer | Non-positive sizes are invalid | -| `step` may be negative (descending loop) | `guarded` adapts the predicate to the step sign | -| `start`, `stop` may be dynamic expressions under `guarded` | Trip count becomes `max(abs(stop - start), 0) / abs(step)` | -| Chunked loop must be inside `pl.auto_incore()` | Only `auto_incore`-scoped loops are split | -| `chunk` may be combined with `init_values` | Both policies thread iter_args through the generated loops | - -## Algorithm - -Let `T = ceil(max(|stop - start|, 0) / |step|)` and `C = chunk`. - -### `guarded` (default) - -1. `n_total = ceil(T / C)` — static when bounds are const, otherwise `(T + C - 1) // C`. -2. Emit outer loop `for out_var in [0, n_total)` and inner loop `for in_var in [0, C)`. -3. Compute `idx = start + (out_var * C + in_var) * step` (substituted into body). -4. Wrap the visited body in an `IfStmt` whose condition is: - - `idx < stop` when `step > 0` - - `idx > stop` when `step < 0` -5. **Without iter_args** — IfStmt has no else branch; skipped iterations are no-ops. -6. **With iter_args** — IfStmt gets `return_vars` acting as phi nodes: the then-branch keeps the user body's trailing `YieldStmt` (updated values), the else-branch yields the inner iter_args unchanged. The inner loop's trailing `YieldStmt` references the IfStmt's phi vars, so loop-carried state threads through both guarded and skipped iterations. - -### `leading_full` - -1. `n_full = T // C`, `n_rem = T % C`. -2. Emit outer loop `for out_var in [0, n_full)` and inner loop `for in_var in [0, C)` with `idx = start + (out_var * C + in_var) * step`. Skip if `n_full == 0`. -3. If `n_rem > 0`, emit a remainder loop `for rem_var in [0, n_rem)` with `idx = start + (n_full * C + rem_var) * step`. 
Its `init_values` chain from the outer loop's `return_vars` (or from the original init if no full-chunk loop was emitted). -4. Remap the original `return_vars` to the final loop's `return_vars`. - -Both paths preserve the original `ForKind` (Sequential, Parallel, or Unroll) on inner and outer/remainder loops. - -## Auto-Name Abbreviations - -Printed IR uses the compact auto-name format `base__qualifier_role_vN`. Abbreviated qualifiers: - -| Abbreviation | Meaning | Emitted by | -| ------------ | ------- | ---------- | -| `co` | chunk_outer | both policies | -| `ci` | chunk_inner | both policies | -| `cr` | chunk_rem (remainder) | `leading_full` only | -| `cg` | chunk_guard (IfStmt phi) | `guarded` with iter_args only | - -Examples: `i__co_idx_v0` (outer index), `x__ci_iter_v1` (inner iter_arg), `x__cr_rv_v1` (remainder return var), `x__cg_rv_v1` (IfStmt phi var). - -## Examples - -### `guarded`, divisible (`chunk=5`, trip_count=10) - -**After**: - -```python -for i__co_idx_v0, (x__co_iter_v1,) in pl.range(2, init_values=(x__ssa_v0,)): - for i__ci_idx_v0, (x__ci_iter_v1,) in pl.range(5, init_values=(x__co_iter_v1,)): - if i__co_idx_v0 * 5 + i__ci_idx_v0 < 10: - x__ssa_v3 = pl.tensor.add(x__ci_iter_v1, 1.0) - x__cg_rv_v1 = pl.yield_(x__ssa_v3) - else: - x__cg_rv_v1 = pl.yield_(x__ci_iter_v1) - x__ci_rv_v1 = pl.yield_(x__cg_rv_v1) - x__co_rv_v1 = pl.yield_(x__ci_rv_v1) -return x__co_rv_v1 -``` - -### `guarded`, dynamic bound (`chunk=4`, `stop=n`) - -**After** (single kernel, `n_total = (n + 3) // 4`): - -```python -for i__co_idx_v0, (x__co_iter_v1,) in pl.range((n + 3) // 4, init_values=(x__ssa_v0,)): - for i__ci_idx_v0, (x__ci_iter_v1,) in pl.range(4, init_values=(x__co_iter_v1,)): - if i__co_idx_v0 * 4 + i__ci_idx_v0 < n: - x__ssa_v3 = pl.tensor.add(x__ci_iter_v1, 1.0) - x__cg_rv_v1 = pl.yield_(x__ssa_v3) - else: - x__cg_rv_v1 = pl.yield_(x__ci_iter_v1) - x__ci_rv_v1 = pl.yield_(x__cg_rv_v1) - x__co_rv_v1 = pl.yield_(x__ci_rv_v1) -return x__co_rv_v1 -``` - -### 
`leading_full`, non-divisible (`chunk=5`, trip_count=7) - -**After** (two sibling loops): - -```python -for i__co_idx_v0, (x__co_iter_v1,) in pl.range(1, init_values=(x__ssa_v0,)): - for i__ci_idx_v0, (x__ci_iter_v1,) in pl.range(5, init_values=(x__co_iter_v1,)): - x__ssa_v3 = pl.tensor.add(x__ci_iter_v1, 1.0) - x__ci_rv_v1 = pl.yield_(x__ssa_v3) - x__co_rv_v1 = pl.yield_(x__ci_rv_v1) -for i__cr_idx_v0, (x__cr_iter_v1,) in pl.range(2, init_values=(x__co_rv_v1,)): - x__ssa_v4 = pl.tensor.add(x__cr_iter_v1, 1.0) - x__cr_rv_v1 = pl.yield_(x__ssa_v4) -return x__cr_rv_v1 -``` - -## LoopOrigin Tagging - -| LoopOrigin | Description | Emitted by | -| ---------- | ----------- | ---------- | -| `Original` | Regular user loop (default) | — | -| `ChunkOuter` | Outer loop over chunk indices | both policies | -| `ChunkInner` | Inner loop within a chunk | both policies | -| `ChunkRemainder` | Remainder loop for leftover iterations | `leading_full` only | - -Access via `for_stmt.attrs.get("loop_origin")` (Python) or `for_stmt->GetAttr("loop_origin")` (C++). - -## Pipeline Position - -```text -UnrollLoops → ConvertToSSA → FlattenCallExpr → SplitChunkedLoops → InterchangeChunkLoops → OutlineIncoreScopes → ... -``` - -## Pass Properties - -| Property | Value | -| -------- | ----- | -| Required | `TypeChecked`, `SSAForm` | -| Produced | `TypeChecked`, `SSAForm` | -| Invalidated | (none) | diff --git a/docs/en/dev/passes/06-interchange_chunk_loops.md b/docs/en/dev/passes/06-interchange_chunk_loops.md deleted file mode 100644 index b6547b6af..000000000 --- a/docs/en/dev/passes/06-interchange_chunk_loops.md +++ /dev/null @@ -1,197 +0,0 @@ -# InterchangeChunkLoops Pass - -Reorders nested ChunkOuter/ChunkInner loop pairs and inserts `InCore` scopes for downstream outlining. 
- -## Overview - -After `SplitChunkedLoops` splits chunked loops into nested `ChunkOuter→ChunkInner` pairs, the structure for nested chunked loops is: - -```text -i_out[ChunkOuter] → i_in[ChunkInner,Parallel] → j_out[ChunkOuter] → j_in[ChunkInner,Parallel] → body -``` - -This pass reorders so all outer loops are on top and wraps the inner loops + body in `InCoreScopeStmt`: - -```text -i_out[ChunkOuter] → j_out[ChunkOuter] → InCore{ i_in[ChunkInner] → j_in[ChunkInner] → body } -``` - -**Requires**: TypeChecked, SSAForm properties. - -**When to use**: Runs automatically in the default pipeline after `SplitChunkedLoops` and before `OutlineIncoreScopes`. Only operates on loops inside `pl.auto_incore()` scope. The `AutoInCore` scope is consumed (removed) by this pass. - -## API - -| C++ | Python | Level | -| --- | ------ | ----- | -| `pass::InterchangeChunkLoops()` | `passes.interchange_chunk_loops()` | Function-level | - -**Python usage**: - -```python -from pypto import passes - -result = passes.interchange_chunk_loops()(program) -``` - -## Constraints - -| Constraint | Behavior | -| ---------- | -------- | -| SSA-only | Runs after `SplitChunkedLoops` (requires `SSAForm`) | -| Parallel-only interchange | Only interchanges when ALL ChunkInner loops have `ForKind::Parallel` | -| Sequential chunked loops | Not interchanged, but wrapped in InCore if inside `auto_incore` | -| Existing InCore | If chain body already contains `InCoreScopeStmt`, skip | -| Requires `auto_incore` scope | Only loops inside `AutoInCoreScopeStmt` are processed; the scope is consumed | - -## Algorithm - -1. **Collect chain** — Starting from a `ChunkOuter` ForStmt, walk into nested ForStmt body. Build list of `(ForStmt, LoopOrigin)` entries. Stop at non-ForStmt, `Original` loop, or `ScopeStmt`. - -2. **Guard checks** — Verify all ChunkInner loops are Parallel. Check no existing InCore scope in innermost body. - -3. **Separate** — Split chain into `outers` (ChunkOuter) and `inners` (ChunkInner). 
- -4. **Reconstruct** (inside-out build): - - Visit the innermost body - - Wrap inners around body (preserving order), reconnecting iter_args - - Wrap in `InCoreScopeStmt` - - Wrap outers around InCore (preserving order), reconnecting iter_args and yields - -5. **Handle remainders** — `ChunkRemainder` loops: recurse into body. Wrap standalone parallel remainder sub-loops in InCore. - -## Auto-Name Abbreviations - -The examples below use compact qualifiers inside `base__qualifier_role_vN` names: - -| Abbreviation | Meaning | -| ------------ | ------- | -| `co` | `chunk_outer` | -| `ci` | `chunk_inner` | -| `cr` | `chunk_rem` / chunk remainder | -| `lN` | interchange loop level `N` | - -Examples: - -- `x__co_iter_v1` = chunk-outer iter_arg before interchange -- `x__co_l0_iter_v1` = loop-threaded iter_arg after interchange, level 0 -- `x__co_l2_rv_v1` = return var flowing out of reordered level 2 - -Roles such as `iter`, `rv`, `idx`, and `ssa` remain unabridged so the variable's purpose stays obvious. 
- -## Example - -**Before** (after SplitChunkedLoops, all parallel): - -```python -for i__co_idx_v0, (x__co_iter_v1,) in pl.range(2, init_values=(x__ssa_v0,)): # ChunkOuter - for i__ci_idx_v0, (x__ci_iter_v1,) in pl.parallel( - 4, init_values=(x__co_iter_v1,) - ): # ChunkInner - for j__co_idx_v0, (y__co_iter_v1,) in pl.range( - 3, init_values=(x__ci_iter_v1,) - ): # ChunkOuter - for j__ci_idx_v0, (y__ci_iter_v1,) in pl.parallel( - 4, init_values=(y__co_iter_v1,) - ): # ChunkInner - z = pl.add(y__ci_iter_v1, 1.0) - y__ci_rv_v1 = pl.yield_(z) - y__co_rv_v1 = pl.yield_(y__ci_rv_v1) - x__ci_rv_v1 = pl.yield_(y__co_rv_v1) - x__co_rv_v1 = pl.yield_(x__ci_rv_v1) -return x__co_rv_v1 -``` - -**After** (InterchangeChunkLoops): - -```python -for i__co_idx_v0, (x__co_l0_iter_v1,) in pl.range( - 2, init_values=(x__ssa_v0,) -): # ChunkOuter - for j__co_idx_v0, (x__co_l1_iter_v1,) in pl.range( - 3, init_values=(x__co_l0_iter_v1,) - ): # ChunkOuter - with pl.incore(): # InCore inserted - for i__ci_idx_v0, (x__co_l2_iter_v1,) in pl.parallel( - 4, init_values=(x__co_l1_iter_v1,) - ): # ChunkInner - for j__ci_idx_v0, (x__co_l3_iter_v1,) in pl.parallel( - 4, init_values=(x__co_l2_iter_v1,) - ): # ChunkInner - z = pl.add(x__co_l3_iter_v1, 1.0) - x__co_l3_rv_v1 = pl.yield_(z) - x__co_l2_rv_v1 = pl.yield_(x__co_l3_rv_v1) - x__co_l1_rv_v1 = pl.yield_(x__co_l2_rv_v1) - x__co_l0_rv_v1 = pl.yield_(x__co_l1_rv_v1) -return x__co_l0_rv_v1 -``` - -## Remainder Handling - -For non-divisible trip counts, remainder loops get InCore wrapping: - -```python -for i_rem, (...) in pl.parallel(2, init_values=(...)): # ChunkRemainder - for j_out, (...) in pl.range(3, init_values=(...)): # Interchange applied - with pl.incore(): - for j_in, (...) in pl.parallel(4, init_values=(...)): - body - with pl.incore(): # Remainder wrapped - for j_rem, (...) 
in pl.parallel(2, init_values=(...)): - body -``` - -## Non-Chunk Statement Handling - -When `auto_incore` is consumed, statements that were not handled by chunk interchange (standalone tensor ops, non-chunked loops, sequential chunked loops that failed the parallel guard) are wrapped in `InCoreScopeStmt` to ensure they get outlined into InCore functions by `OutlineIncoreScopes`. - -Consecutive non-InCore statements are grouped into a single `InCoreScopeStmt`. Control flow statements (`YieldStmt`, `ReturnStmt`) and pure scalar assignments (e.g., index arithmetic like `offset = ob * 32`) are never wrapped — they stay in the orchestration scope. - -**Example** — standalone op + parallel chunk: - -```python -# Before (inside auto_incore, after SplitChunkedLoops) -with pl.auto_incore(): - x = pl.add(x, 1.0) # standalone op - for i_out in pl.range(2): # ChunkOuter (parallel inner) - for i_in in pl.parallel(4): - x = pl.add(x, 2.0) - -# After InterchangeChunkLoops -with pl.incore(): # standalone wrapped - x = pl.add(x, 1.0) -for i_out in pl.range(2): # interchanged chunk - with pl.incore(): - for i_in in pl.parallel(4): - x = pl.add(x, 2.0) -``` - -**Example** — sequential chunk (fails interchange guard): - -```python -# Before -with pl.auto_incore(): - for i_out in pl.range(2): # ChunkOuter (sequential inner) - for i_in in pl.range(4): # ChunkInner, Sequential → fails guard - x = pl.add(x, 1.0) - -# After — entire chain wrapped in InCore -with pl.incore(): - for i_out in pl.range(2): - for i_in in pl.range(4): - x = pl.add(x, 1.0) -``` - -## Pipeline Position - -```text -UnrollLoops → ConvertToSSA → FlattenCallExpr → SplitChunkedLoops → InterchangeChunkLoops → OutlineIncoreScopes → ... 
-``` - -## Pass Properties - -| Property | Value | -| -------- | ----- | -| Required | `TypeChecked`, `SSAForm` | -| Produced | `TypeChecked`, `SSAForm` | -| Invalidated | (none) | diff --git a/docs/en/dev/passes/06-outline_incore_scopes.md b/docs/en/dev/passes/06-outline_incore_scopes.md new file mode 100644 index 000000000..ec1f5c1a8 --- /dev/null +++ b/docs/en/dev/passes/06-outline_incore_scopes.md @@ -0,0 +1,228 @@ +# OutlineIncoreScopes Pass + +Outlines `HierarchyScopeStmt` regions with `level_ == CORE_GROUP` into +dedicated `Function(InCore)` definitions and promotes the enclosing parent +function from `Opaque` to `Orchestration`. + +## Overview + +This pass specifically targets the `CORE_GROUP` form of +`HierarchyScopeStmt` — the per-core-group kernel region introduced by +`with pl.at(level=pl.Level.CORE_GROUP):`. Each such scope is extracted +into a new `Function` whose `func_type_` is `FunctionType::InCore`, and the +original scope is replaced with a `Call` to that outlined function. Whenever +any `CORE_GROUP` scope is outlined out of a given parent function, that +parent's `func_type_` is promoted from `Opaque` to `Orchestration`. + +This pass is the CORE_GROUP counterpart of +[`OutlineHierarchyScopes`](05-outline_hierarchy_scopes.md), which handles +the remaining (non-CORE_GROUP) hierarchy levels by emitting +`Function(Opaque)` and leaving the parent type alone. 
+ +| Scope `level_` | Outlined function type | Parent function type after pass | +| -------------- | ---------------------- | ------------------------------- | +| `Level.CORE_GROUP` | `FunctionType::InCore` | promoted `Opaque` → `Orchestration` | +| any other level | *(not handled — already outlined by `OutlineHierarchyScopes`)* | — | + +When a `CORE_GROUP` scope carries a `split_` optimization hint, the hint is +attached to the outlined `InCore` function as a `split` attribute so that +downstream passes — notably +[`ExpandMixedKernel`](11-expand_mixed_kernel.md) — can honour it when +deciding how to split the kernel into AIC / AIV halves. + +**Requirements**: + +- Input IR must be in SSA form (run `ConvertToSSA` first). SSA form is + preserved (produced) by this pass. +- Expects `OutlineHierarchyScopes` to have already run, so only + `CORE_GROUP` `HierarchyScopeStmt` nodes remain to be outlined. +- Only processes `Opaque` functions (which may contain residual + `CORE_GROUP` scopes). Functions already typed as `Orchestration`, + `InCore`, `AIC`, `AIV`, or `Group` are left untouched. + +**When to use**: Run immediately after +[`OutlineHierarchyScopes`](05-outline_hierarchy_scopes.md) and before +[`OutlineClusterScopes`](07-outline_cluster_scopes.md). By the time this +pass finishes, the `HierarchyOutlined` property holds: no +`HierarchyScopeStmt` nodes remain in `Opaque` or `Orchestration` functions. + +## API + +| C++ | Python | Level | +| --- | ------ | ----- | +| `pass::OutlineIncoreScopes()` | `passes.outline_incore_scopes()` | Program-level | + +**Factory function**: + +```cpp +Pass OutlineIncoreScopes(); +``` + +**Python usage**: + +```python +from pypto.pypto_core import passes + +outline_pass = passes.outline_incore_scopes() +program_outlined = outline_pass(program) +``` + +## Algorithm + +1. **Scan for CORE_GROUP Scopes**: Find every `HierarchyScopeStmt` in each + `Opaque` function body whose `level_ == CORE_GROUP`. +2. 
**Analyze Inputs/Outputs**: Use the shared scope-outline helpers to + compute the set of variables defined outside but used inside (inputs) + and defined inside but used outside (outputs). +3. **Create Outlined InCore Function**: Extract the scope body into a new + `Function`: + - Parameters = input variables + - Returns = output variables + - Body = the scope body + - `func_type_` = `InCore` + - Copy `role_` into function attrs. + - If the scope carries a `split_` optimization hint, copy it into the + function's `split` attr (consumed by `ExpandMixedKernel`). +4. **Replace the Scope**: Substitute the original `HierarchyScopeStmt` + with a `Call` to the outlined InCore function followed by `AssignStmt`s + that bind its return values. +5. **Promote Parent**: If any `CORE_GROUP` scope was outlined from the + parent function, re-type that parent from `Opaque` to `Orchestration`. +6. **Add to Program**: Prepend the outlined InCore functions to the + program's function list. + +**Naming**: `{original_func}_core_group_{counter}` (e.g. +`main_core_group_0`). Outlined InCore functions use a `_core_group_`-style +name suffix and are easily identifiable in printed IR. When +`HierarchyScopeStmt.name_hint` is non-empty the hint is used directly.
+ +## Example + +### CORE_GROUP → InCore + Orchestration + +**Before** (after `OutlineHierarchyScopes`, non-CORE_GROUP scopes are +already outlined; the CORE_GROUP scope still sits inline in `main`): + +```python +@pl.program +class Before: + @pl.function # Opaque + def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + y = x + 1 + + with pl.at(level=pl.Level.CORE_GROUP): + tile = pl.load(y, [0], [64]) + tile_sq = pl.mul(tile, tile) + result_tile = tile_sq + 1 + result = pl.store(result_tile, [0], x) + + z = result + 2 + return z +``` + +**After**: + +```python +@pl.program +class After: + @pl.function(type=pl.FunctionType.Orchestration) # promoted + def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + y = x + 1 + result = self.main_core_group_0(y, x) # Call to outlined InCore fn + z = result + 2 + return z + + @pl.function(type=pl.FunctionType.InCore) # outlined + def main_core_group_0(self, y: pl.Tensor[[64], pl.FP32], + x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + tile = pl.load(y, [0], [64]) + tile_sq = pl.mul(tile, tile) + result_tile = tile_sq + 1 + result = pl.store(result_tile, [0], x) + return result +``` + +### CORE_GROUP with split hint + +```python +with pl.at(level=pl.Level.CORE_GROUP, + optimizations=[pl.split(pl.SplitMode.UP_DOWN)]): + ... +``` + +The outlined `InCore` function receives the `split` hint in its attrs, +which `ExpandMixedKernel` later reads to split the kernel into AIC + AIV +halves. 
+ +### Multiple outputs + +```python +with pl.at(level=pl.Level.CORE_GROUP): + a_tile = pl.load(a, [0], [64]) + b_tile = pl.load(b, [0], [64]) + c_tile = pl.add(a_tile, b_tile) + out_a = pl.store(c_tile, [0], out) + out_b = pl.mul(c_tile, 2.0) +# both out_a and out_b used after the scope +x = out_a + out_b +``` + +After outlining, the parent body becomes: + +```python +out_a, out_b = self.main_core_group_0(a, b, out) # multiple return values +x = out_a + out_b +``` + +## Implementation + +**Header**: `include/pypto/ir/transforms/passes.h` + +```cpp +Pass OutlineIncoreScopes(); +``` + +**Implementation**: `src/ir/transforms/outline_incore_scopes_pass.cpp` + +- Uses the shared `scope_outline_utils` to compute inputs/outputs +- Builds a new `Function(InCore)` per `CORE_GROUP` scope +- Copies `role_` / `split_` metadata onto the outlined function's attrs +- Re-types the parent function from `Opaque` to `Orchestration` when at + least one `CORE_GROUP` scope was outlined out of it + +**Python binding**: `python/bindings/modules/passes.cpp` + +```cpp +passes.def("outline_incore_scopes", &pass::OutlineIncoreScopes, + "Outline CORE_GROUP HierarchyScopeStmt regions into Function(InCore) " + "and promote the parent function to Orchestration"); +``` + +**Tests**: `tests/ut/ir/transforms/test_outline_incore_scopes.py` + +- Tests `CORE_GROUP` scope → `InCore` function + parent `Orchestration` +- Tests `split_` propagation onto the outlined InCore function +- Tests input/output analysis +- Tests multiple `CORE_GROUP` scopes in the same parent function +- Tests SSA preservation + +## Pass Properties + +| Property | Value | +| -------- | ----- | +| Required | `SSAForm` | +| Produced | `SSAForm`, `HierarchyOutlined` | +| Invalidated | — | + +`HierarchyOutlined` is produced here (not by +[`OutlineHierarchyScopes`](05-outline_hierarchy_scopes.md)): after both +outline passes have run, no `HierarchyScopeStmt` nodes remain in +`Opaque`/`Orchestration` functions.
+ +## Pipeline Position + +```text +... → ConvertToSSA → NormalizeStmtStructure → FlattenCallExpr → +OutlineHierarchyScopes → OutlineIncoreScopes → OutlineClusterScopes → +ConvertTensorToTileOps → ... +``` diff --git a/docs/en/dev/passes/08-outline_cluster_scopes.md b/docs/en/dev/passes/07-outline_cluster_scopes.md similarity index 72% rename from docs/en/dev/passes/08-outline_cluster_scopes.md rename to docs/en/dev/passes/07-outline_cluster_scopes.md index c4cd839cb..c16ff83f2 100644 --- a/docs/en/dev/passes/08-outline_cluster_scopes.md +++ b/docs/en/dev/passes/07-outline_cluster_scopes.md @@ -11,7 +11,7 @@ This pass transforms `ClusterScopeStmt` nodes into separate `Function(Group)` de - Input IR must be in SSA form (run ConvertToSSA first) - Only processes Opaque and Orchestration functions -**When to use**: Run after `OutlineIncoreScopes` when the IR contains `with pl.cluster():` scopes or standalone `with pl.spmd(...):` scopes that need to be extracted into wrapper functions. +**When to use**: Run after `OutlineHierarchyScopes` and `OutlineIncoreScopes` when the IR contains `with pl.cluster():` scopes or standalone `with pl.spmd(...):` scopes that need to be extracted into wrapper functions. The cluster body may still contain calls to `Function(InCore)` produced earlier by `OutlineIncoreScopes`. 
## API @@ -42,7 +42,9 @@ program_outlined = outline_pass(program) ## Example -**Before**: +**Before** (assume `OutlineIncoreScopes` has already turned the inner +`with pl.at(level=pl.Level.CORE_GROUP): ...` scope into a call to an outlined +`Function(InCore)` named `main_core_group_0`): ```python @pl.program @@ -50,8 +52,7 @@ class Before: @pl.function def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: with pl.cluster(): - with pl.incore(): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) + y: pl.Tensor[[64], pl.FP32] = self.main_core_group_0(x) return y ``` @@ -62,8 +63,7 @@ class Before: class After: @pl.function(type=pl.FunctionType.Group) def main_cluster_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.incore(): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) + y: pl.Tensor[[64], pl.FP32] = self.main_core_group_0(x) return y @pl.function @@ -72,7 +72,10 @@ class After: return y ``` -Note: InCore scopes inside the Cluster are preserved in the outlined Group function. Run `OutlineIncoreScopes` first to outline InCore scopes before clustering, or after to outline them within Group functions. +Note: `OutlineHierarchyScopes` and `OutlineIncoreScopes` run before this +pass, so the cluster body already contains calls to `Function(InCore)` +rather than inline `HierarchyScopeStmt` nodes. The outlined Group function +preserves those calls. 
## Standalone Spmd Example @@ -133,12 +136,12 @@ class After: | Produced | SSAForm, ClusterOutlined | | Invalidated | — | -## Relationship to OutlineIncoreScopes +## Relationship to OutlineHierarchyScopes / OutlineIncoreScopes -| Aspect | OutlineIncoreScopes | OutlineClusterScopes | -| ------ | ------------------- | -------------------- | -| Scope kind | `ScopeKind::InCore` | `ScopeKind::Cluster` / standalone `ScopeKind::Spmd` | -| Output function type | `FunctionType::InCore` | `FunctionType::Group` / `FunctionType::Spmd` | -| Naming pattern | `{func}_incore_{n}` | `{func}_cluster_{n}` / `{func}_spmd_{n}` | -| Promotes parent to | Orchestration | *(unchanged)* | -| Processes | Opaque functions only | Opaque + Orchestration | +| Aspect | OutlineHierarchyScopes | OutlineIncoreScopes | OutlineClusterScopes | +| ------ | ---------------------- | ------------------- | -------------------- | +| Scope kind | `HierarchyScopeStmt` (non-CORE_GROUP) | `HierarchyScopeStmt` (CORE_GROUP) | `ClusterScopeStmt` / standalone `SpmdScopeStmt` | +| Output function type | `FunctionType::Opaque` | `FunctionType::InCore` | `FunctionType::Group` / `FunctionType::Spmd` | +| Naming pattern | `{func}_{level}_{n}` | `{func}_core_group_{n}` | `{func}_cluster_{n}` / `{func}_spmd_{n}` | +| Promotes parent to | *(unchanged)* | `Orchestration` | *(unchanged)* | +| Processes | `Opaque` functions only | `Opaque` functions only | `Opaque` + `Orchestration` | diff --git a/docs/en/dev/passes/07-outline_incore_scopes.md b/docs/en/dev/passes/07-outline_incore_scopes.md deleted file mode 100644 index 269df4d50..000000000 --- a/docs/en/dev/passes/07-outline_incore_scopes.md +++ /dev/null @@ -1,173 +0,0 @@ -# OutlineIncoreScopes Pass - -Outlines InCore scopes into separate functions. - -## Overview - -This pass transforms `InCoreScopeStmt` nodes into separate `Function(InCore)` definitions and replaces the scope with a Call to the outlined function. 
- -**Requirements**: - -- Input IR must be in SSA form (run ConvertToSSA first); SSAForm is preserved (produced) by this pass -- Only processes Opaque functions (InCore functions are left unchanged) - -**When to use**: Run after ConvertToSSA when you need to extract InCore computation regions into separate callable functions. - -## API - -| C++ | Python | Level | -| --- | ------ | ----- | -| `pass::OutlineIncoreScopes()` | `passes.outline_incore_scopes()` | Program-level | - -**Factory function**: - -```cpp -Pass OutlineIncoreScopes(); -``` - -**Python usage**: - -```python -from pypto.pypto_core import passes - -outline_pass = passes.outline_incore_scopes() -program_outlined = outline_pass(program) -``` - -## Algorithm - -1. **Scan for InCore Scopes**: Find all `InCoreScopeStmt` nodes in Opaque functions -2. **Analyze Inputs**: Determine external variable references (variables defined outside scope, used inside) -3. **Analyze Outputs**: Determine internal definitions used after scope (variables defined inside, used outside) -4. **Create Function**: Extract scope body into new `Function(scope_type=InCore)` with: - - Parameters = input variables - - Returns = output variables - - Body = scope body -5. **Replace Scope**: Replace `InCoreScopeStmt` with: - - Call to outlined function with input arguments - - AssignStmt for each output variable -6. 
**Add to Program**: Add outlined function to program's function list - -**Naming**: - -- Default: `{original_func}_incore_{counter}` (e.g., `main_incore_0`, `main_incore_1`) -- User-provided: when `InCoreScopeStmt.name_hint` is non-empty, that name is used directly - - `with pl.incore(name_hint="fused_add"):` → function named `fused_add` - -## Example - -### Basic Outlining - -**Before**: - -```python -@pl.program -class Before: - @pl.function # Opaque function - def main(self, x: Tensor[[64], FP32]) -> Tensor[[64], FP32]: - y = x + 1 - - with pl.incore(): # InCore scope - tile = pl.load(y, [0], [64]) - tile_sq = pl.mul(tile, tile) - result_tile = tile_sq + 1 - result = pl.store(result_tile, [0], x) - - z = result + 2 - return z -``` - -**After**: - -```python -@pl.program -class After: - @pl.function # Opaque function - def main(self, x: Tensor[[64], FP32]) -> Tensor[[64], FP32]: - y = x + 1 - - # Scope replaced with call + assignments - result = self.main_incore_0(y, x) # Call outlined function - - z = result + 2 - return z - - @pl.function(scope_type=InCore) # Outlined InCore function - def main_incore_0(self, y: Tensor[[64], FP32], x: Tensor[[64], FP32]) -> Tensor[[64], FP32]: - # Scope body moved here - tile = pl.load(y, [0], [64]) - tile_sq = pl.mul(tile, tile) - result_tile = tile_sq + 1 - result = pl.store(result_tile, [0], x) - return result -``` - -### Multiple Outputs - -**Before**: - -```python -with pl.incore(): - a_tile = pl.load(a, [0], [64]) - b_tile = pl.load(b, [0], [64]) - c_tile = pl.add(a_tile, b_tile) - out_a = pl.store(c_tile, [0], out) - out_b = pl.mul(c_tile, 2.0) -# Both out_a and out_b used after scope -x = out_a + out_b -``` - -**After**: - -```python -out_a, out_b = self.main_incore_0(a, b, out) # Multiple outputs -x = out_a + out_b - -# Outlined function: -def main_incore_0(self, a, b, out): - a_tile = pl.load(a, [0], [64]) - b_tile = pl.load(b, [0], [64]) - c_tile = pl.add(a_tile, b_tile) - out_a = pl.store(c_tile, [0], out) - out_b = 
pl.mul(c_tile, 2.0) - return (out_a, out_b) -``` - -## Implementation - -**Header**: `include/pypto/ir/transforms/passes.h` - -```cpp -Pass OutlineIncoreScopes(); -``` - -**Implementation**: `src/ir/transforms/outline_incore_scopes.cpp` - -- Uses SSA analysis to determine inputs/outputs -- Creates new Function nodes with InCore scope type -- Replaces InCoreScopeStmt with Call + AssignStmt -- Manages function naming and counters - -**Python binding**: `python/bindings/modules/passes.cpp` - -```cpp -passes.def("outline_incore_scopes", &pass::OutlineIncoreScopes, "Outline InCore scopes"); -``` - -**Tests**: `tests/ut/ir/transforms/test_outline_incore_scopes.py` - -- Tests basic scope outlining -- Tests input/output analysis -- Tests multiple scopes in same function -- Tests nested scopes -- Tests SSA preservation - -## Requirements - -**SSA form required**: The pass relies on SSA properties: - -- Single assignment ensures clear input/output analysis -- No variable shadowing simplifies scope analysis -- YieldStmt in control flow handled correctly - -**Run ConvertToSSA first** if IR is not in SSA form. diff --git a/docs/en/dev/passes/09-convert_tensor_to_tile_ops.md b/docs/en/dev/passes/08-convert_tensor_to_tile_ops.md similarity index 90% rename from docs/en/dev/passes/09-convert_tensor_to_tile_ops.md rename to docs/en/dev/passes/08-convert_tensor_to_tile_ops.md index 1c069c3ca..243f575ed 100644 --- a/docs/en/dev/passes/09-convert_tensor_to_tile_ops.md +++ b/docs/en/dev/passes/08-convert_tensor_to_tile_ops.md @@ -4,14 +4,14 @@ Converts tensor operations to tile operations in InCore functions and updates or ## Overview -After `OutlineIncoreScopes` extracts InCore scopes into separate functions, those functions still operate on `TensorType` variables using `tensor.*` operations. This pass lowers them to `TileType` variables with `tile.*` operations that map directly to PTO-ISA instructions. 
+After `OutlineHierarchyScopes` and `OutlineIncoreScopes` extract `HierarchyScopeStmt` regions into separate functions (with `OutlineIncoreScopes` producing `Function(InCore)` for `CORE_GROUP` scopes), those InCore functions still operate on `TensorType` variables using `tensor.*` operations. This pass lowers them to `TileType` variables with `tile.*` operations that map directly to PTO-ISA instructions. The pass also updates call sites in orchestration/opaque functions: for each new output parameter added to an InCore function, a `tensor.create` is inserted at the call site. **Requirements**: - Input IR must be in SSA form -- InCore scopes must be outlined (run `OutlineIncoreScopes` first) +- Hierarchy scopes must be outlined into functions (run `OutlineHierarchyScopes` and `OutlineIncoreScopes` first) - Statement structure must be normalized **When to use**: Run after `OutlineClusterScopes` and before `OptimizeOrchTensors`. @@ -119,7 +119,7 @@ Key changes: | Property | Value | | -------- | ----- | -| Required | SSAForm, SplitIncoreOrch, NormalizedStmtStructure | +| Required | SSAForm, HierarchyOutlined, NormalizedStmtStructure | | Produced | SSAForm, IncoreTileOps, NormalizedStmtStructure | | Invalidated | — | diff --git a/docs/en/dev/passes/10-optimize_orch_tensors.md b/docs/en/dev/passes/09-optimize_orch_tensors.md similarity index 98% rename from docs/en/dev/passes/10-optimize_orch_tensors.md rename to docs/en/dev/passes/09-optimize_orch_tensors.md index b80645abc..616abf3d7 100644 --- a/docs/en/dev/passes/10-optimize_orch_tensors.md +++ b/docs/en/dev/passes/09-optimize_orch_tensors.md @@ -132,8 +132,8 @@ The `tensor.create` is eliminated; the iter-arg buffer is reused across iteratio | Property | Value | | -------- | ----- | -| Required | SplitIncoreOrch, IncoreTileOps | -| Produced | SplitIncoreOrch, IncoreTileOps | +| Required | HierarchyOutlined, IncoreTileOps | +| Produced | HierarchyOutlined, IncoreTileOps | | Invalidated | — | ## Key Components diff 
--git a/docs/en/dev/passes/11-flatten_tile_nd_to_2d.md b/docs/en/dev/passes/10-flatten_tile_nd_to_2d.md similarity index 100% rename from docs/en/dev/passes/11-flatten_tile_nd_to_2d.md rename to docs/en/dev/passes/10-flatten_tile_nd_to_2d.md diff --git a/docs/en/dev/passes/14-expand_mixed_kernel.md b/docs/en/dev/passes/11-expand_mixed_kernel.md similarity index 96% rename from docs/en/dev/passes/14-expand_mixed_kernel.md rename to docs/en/dev/passes/11-expand_mixed_kernel.md index d5ce000c9..5f245b0b8 100644 --- a/docs/en/dev/passes/14-expand_mixed_kernel.md +++ b/docs/en/dev/passes/11-expand_mixed_kernel.md @@ -4,7 +4,7 @@ Expands mixed InCore functions into separate AIC (Cube) + AIV (Vector) kernels w ## Overview -After `OutlineIncoreScopes` and `ConvertTensorToTileOps`, InCore functions may contain both Cube ops (`tile.matmul`, `tile.gemv`, etc.) and Vector ops (`tile.add`, `tile.exp`, etc.). Some ops like `tile.load`, `tile.store`, `tile.move`, and `tile.reshape` are classified as Cube or Vector based on the MemorySpace of their tile operands. Functions containing ops from both sides are **mixed InCore functions**. Hardware requires Cube and Vector operations to run on separate core types, so this pass splits them into: +After `OutlineHierarchyScopes` and `ConvertTensorToTileOps`, InCore functions may contain both Cube ops (`tile.matmul`, `tile.gemv`, etc.) and Vector ops (`tile.add`, `tile.exp`, etc.). Some ops like `tile.load`, `tile.store`, `tile.move`, and `tile.reshape` are classified as Cube or Vector based on the MemorySpace of their tile operands. Functions containing ops from both sides are **mixed InCore functions**. 
 Hardware requires Cube and Vector operations to run on separate core types, so this pass splits them into: - **AIC function** (`FunctionType::AIC`) — contains only Cube + shared ops - **AIV function** (`FunctionType::AIV`) — contains only Vector + shared ops @@ -76,7 +76,7 @@ For consumer-side cross-core tiles, the pass also normalizes statement order to **Requirements**: - Input IR must have tile ops (run `ConvertTensorToTileOps` first) -- Input IR must have InCore scopes outlined (run `OutlineIncoreScopes` first) +- Input IR must have hierarchy scopes outlined into functions (run `OutlineHierarchyScopes` and `OutlineIncoreScopes` first) - Tile ops must be flattened to 2D (run `FlattenTileNdTo2D` first) - Tile memory space must be inferred (run `InferTileMemorySpace` first) - Cross-core fractal TileView assignment is supported on Ascend950 and Ascend910B backends @@ -292,7 +292,7 @@ class After: | Property | Value | | -------- | ----- | -| Required | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D, TileMemoryInferred | +| Required | SSAForm, IncoreTileOps, HierarchyOutlined, TileOps2D, TileMemoryInferred | | Produced | SSAForm, MixedKernelExpanded | | Invalidated | — | diff --git a/docs/en/dev/passes/15-init_memref.md b/docs/en/dev/passes/12-init_memref.md similarity index 98% rename from docs/en/dev/passes/15-init_memref.md rename to docs/en/dev/passes/12-init_memref.md index de54b5100..e37b8b381 100644 --- a/docs/en/dev/passes/15-init_memref.md +++ b/docs/en/dev/passes/12-init_memref.md @@ -12,7 +12,7 @@ This pass performs three tasks: Memory space is read from `TileType::memory_space_` (set by InferTileMemorySpace). Variables without `memory_space` default to DDR. -**Requires**: SSAForm, SplitIncoreOrch, IncoreTileOps, TileOps2D, TileMemoryInferred. +**Requires**: SSAForm, HierarchyOutlined, IncoreTileOps, TileOps2D, TileMemoryInferred. **Produces**: HasMemRefs, NormalizedStmtStructure.
diff --git a/docs/en/dev/passes/16-memory_reuse.md b/docs/en/dev/passes/13-memory_reuse.md similarity index 100% rename from docs/en/dev/passes/16-memory_reuse.md rename to docs/en/dev/passes/13-memory_reuse.md diff --git a/docs/en/dev/passes/17-allocate_memory_addr.md b/docs/en/dev/passes/14-allocate_memory_addr.md similarity index 100% rename from docs/en/dev/passes/17-allocate_memory_addr.md rename to docs/en/dev/passes/14-allocate_memory_addr.md diff --git a/docs/en/dev/passes/20-partial_unroll_tile_loops.md b/docs/en/dev/passes/15-partial_unroll_tile_loops.md similarity index 97% rename from docs/en/dev/passes/20-partial_unroll_tile_loops.md rename to docs/en/dev/passes/15-partial_unroll_tile_loops.md index 14f9d2786..b35ce7629 100644 --- a/docs/en/dev/passes/20-partial_unroll_tile_loops.md +++ b/docs/en/dev/passes/15-partial_unroll_tile_loops.md @@ -8,7 +8,7 @@ Lowers `pl.range(N, unroll=F)` at the tile level: replicates the loop body `F` t `PartialUnrollTileLoops` provides the targeted knob: replicate the body `F` times (typically 2–4) at the tile level, leaving an outer loop of `N/F` iterations. Each clone gets fresh def-vars (SSA preserved) and operates on independent tiles, which downstream `MemoryReuse` cannot merge. -**Requires**: SSAForm, SplitIncoreOrch, IncoreTileOps, TileOps2D, TileMemoryInferred, NormalizedStmtStructure. +**Requires**: SSAForm, HierarchyOutlined, IncoreTileOps, TileOps2D, TileMemoryInferred, NormalizedStmtStructure. **Pipeline position**: After `NormalizeReturnOrder`, before `InitMemRef` (slot 20.5). Late enough that all tile-structural decisions are made; early enough that `InitMemRef`/`MemoryReuse` see distinct tile vars per clone. 
@@ -157,6 +157,6 @@ Every main-loop iteration AND every tail branch carries the `unroll_replicated` ## Related -- [`ReorderUnrolledIO`](21-reorder_unrolled_io.md) — consumes the `unroll_replicated` marker +- [`ReorderUnrolledIO`](16-reorder_unrolled_io.md) — consumes the `unroll_replicated` marker - [`UnrollLoops`](01-unroll_loops.md) — full-unroll pass at slot #1, kept as the primary `pl.unroll(N)` lowering - RFC #1025 — design document diff --git a/docs/en/dev/passes/21-reorder_unrolled_io.md b/docs/en/dev/passes/16-reorder_unrolled_io.md similarity index 95% rename from docs/en/dev/passes/21-reorder_unrolled_io.md rename to docs/en/dev/passes/16-reorder_unrolled_io.md index 6b642e00c..4ade4e2d9 100644 --- a/docs/en/dev/passes/21-reorder_unrolled_io.md +++ b/docs/en/dev/passes/16-reorder_unrolled_io.md @@ -14,7 +14,7 @@ This pass reorders each marked `SeqStmts` so: The result is `[loads…, compute…, stores…]` whenever the dataflow allows. Sibling clones' input tiles are co-live near the top, output tiles co-live near the bottom — `MemoryReuse` cannot coalesce them, so each clone keeps its own MemRef and ping-pong buffering becomes possible. -**Requires**: SSAForm, SplitIncoreOrch, IncoreTileOps, TileOps2D, TileMemoryInferred, NormalizedStmtStructure. +**Requires**: SSAForm, HierarchyOutlined, IncoreTileOps, TileOps2D, TileMemoryInferred, NormalizedStmtStructure. **Pipeline position**: After `PartialUnrollTileLoops`, before `InitMemRef` (slot 20.6). Running before `InitMemRef` keeps SSAForm intact for the dependency analysis. 
@@ -112,7 +112,7 @@ All four `tile_x_k` are now co-live up to the last load, and all four `tile_y_k` ## Related -- [`PartialUnrollTileLoops`](20-partial_unroll_tile_loops.md) — produces the `unroll_replicated` marker this pass consumes -- [`MemoryReuse`](16-memory_reuse.md) — runs after this pass; benefits from the co-live tiles +- [`PartialUnrollTileLoops`](15-partial_unroll_tile_loops.md) — produces the `unroll_replicated` marker this pass consumes +- [`MemoryReuse`](13-memory_reuse.md) — runs after this pass; benefits from the co-live tiles - RFC #1025 — design document - RFC #1026 / PR #1029 — InOut-use discipline + dependency analysis utility diff --git a/docs/en/dev/passes/99-verifier.md b/docs/en/dev/passes/99-verifier.md index 79c68238e..89a55b0b1 100644 --- a/docs/en/dev/passes/99-verifier.md +++ b/docs/en/dev/passes/99-verifier.md @@ -15,7 +15,7 @@ Extensible verification system for validating PyPTO IR correctness through plugg - **Pluggable Rule System**: Extend with custom verification rules - **Property-Based Verification**: Opt-in property sets — verify exactly what you need -- **Structural Properties**: TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore, and InOutUseValid are verified at pipeline start by `PassPipeline` and before/after each pass by `VerificationInstrument` +- **Structural Properties**: TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, and InOutUseValid are verified at pipeline start by `PassPipeline` and before/after each pass by `VerificationInstrument` - **Dual Verification Modes**: Collect diagnostics or throw on first error - **Pass Integration**: Use as a Pass in optimization pipelines - **Comprehensive Diagnostics**: Collect all issues with source locations @@ -26,10 +26,10 @@ Extensible verification system for validating PyPTO IR correctness through plugg | Category | Examples | Behavior | | -------- | -------- | -------- | -| **Structural** 
| TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore, InOutUseValid | Always true. Verified at pipeline start and before/after each pass by `VerificationInstrument`. Never in PassProperties. | +| **Structural** | TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, InOutUseValid | Always true. Verified at pipeline start and before/after each pass by `VerificationInstrument`. Never in PassProperties. | | **Pipeline** | SSAForm, NoNestedCalls, HasMemRefs, ... | Produced/invalidated by passes. Verified per pass-declared contracts. | -`GetStructuralProperties()` returns `{TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore, InOutUseValid}`. These are verified **at pipeline start** by `PassPipeline::Run()` and **before/after each pass** by `VerificationInstrument`. Since no pass declares them in `required`/`produced`/`invalidated`, `VerificationInstrument` unions them with the pass's declared properties to ensure no pass breaks these fundamental invariants. +`GetStructuralProperties()` returns `{TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, InOutUseValid}`. These are verified **at pipeline start** by `PassPipeline::Run()` and **before/after each pass** by `VerificationInstrument`. Since no pass declares them in `required`/`produced`/`invalidated`, `VerificationInstrument` unions them with the pass's declared properties to ensure no pass breaks these fundamental invariants. 
### Verification Rule System @@ -68,12 +68,11 @@ The `run_verifier()` utility creates a standalone `Pass` for ad-hoc use in custo | **UseAfterDefCheck** | UseAfterDef | Every Var use dominated by a definition (param, AssignStmt, loop var, iter_arg, return_var) | | **NormalizedStmtStructure** | NormalizedStmtStructure | Nested `SeqStmts` flattened and single-child `SeqStmts` unwrapped | | **NoRedundantBlocks** | NoRedundantBlocks | No single-child or nested `SeqStmts` | -| **SplitIncoreOrch** | SplitIncoreOrch | No `InCoreScopeStmt` nodes remain in Opaque functions | +| **HierarchyOutlined** | HierarchyOutlined | No `HierarchyScopeStmt` nodes remain in `Opaque` or `Orchestration` functions | | **IncoreTileOps** | IncoreTileOps | InCore functions use tile ops (no tensor-level ops remain) | | **HasMemRefs** | HasMemRefs | All TileType variables have MemRef initialized | | **AllocatedMemoryAddr** | AllocatedMemoryAddr | All MemRefs have valid addresses within buffer limits | | **OutParamNotShadowed** | OutParamNotShadowed | Out/InOut params not reassigned with tensor-creating ops | -| **NoNestedInCore** | NoNestedInCore | No nested InCore scopes (`InCoreScopeStmt` inside `InCoreScopeStmt`) | | **InOutUseValid** | InOutUseValid | Variables passed as InOut/Out to user-function calls are not read after the call (RFC #1026). Group-typed function bodies are skipped pending follow-up. | ### SSAVerify @@ -161,8 +160,8 @@ Singleton registry mapping `IRProperty` values to `PropertyVerifier` factories. 
| Function | Returns | Description | | -------- | ------- | ----------- | -| `GetStructuralProperties()` | `{TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore, InOutUseValid}` | Invariants verified at pipeline start and before/after each pass | -| `GetDefaultVerifyProperties()` | `{SSAForm, TypeChecked, NoNestedCalls, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore}` | Default set for `run_verifier()` | +| `GetStructuralProperties()` | `{TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, InOutUseValid}` | Invariants verified at pipeline start and before/after each pass | +| `GetDefaultVerifyProperties()` | `{SSAForm, TypeChecked, NoNestedCalls, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed}` | Default set for `run_verifier()` | | `GetVerifiedProperties()` | `{SSAForm, TypeChecked, AllocatedMemoryAddr, BreakContinueValid, NoRedundantBlocks, InOutUseValid}` | Lightweight set for `PassPipeline` auto-verify | ### RunVerifier Pass Factory diff --git a/docs/en/user/01-language_guide.md b/docs/en/user/01-language_guide.md index 6ce6b77ba..acc68f7b4 100644 --- a/docs/en/user/01-language_guide.md +++ b/docs/en/user/01-language_guide.md @@ -410,50 +410,23 @@ class Model: Mark a code region as InCore execution without making a separate function: ```python -# Preferred (new API): with pl.at(level=pl.Level.CORE_GROUP): y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - -# Deprecated (use pl.at instead): -with pl.incore(): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) ``` -For compiler-driven chunked loop outlining (AutoInCore), pass `pl.auto_chunk` in -the `optimizations` list: +`OutlineIncoreScopes` later extracts this region into a +`Function(InCore)` and re-types the parent `Opaque` function as +`Orchestration`. 
(Non-CORE_GROUP `pl.at(level=...)` regions are extracted +by the preceding `OutlineHierarchyScopes` pass into `Function(Opaque)`, +without parent promotion.) -```python -# Preferred (new API): -with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk]): - for i in pl.parallel(0, 8, 1, chunk=4): - x = pl.add(x, x) - -# Deprecated (still works, emits DeprecationWarning): -with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - ... - -with pl.auto_incore(): - ... -``` - -To set a cross-core split mode (consumed by the `ExpandMixedKernel` pass), use -`pl.split(...)` — independent from `pl.auto_chunk`, so the two can be combined: +To set a cross-core split mode (consumed by the `ExpandMixedKernel` pass), +pass `pl.split(...)` in `optimizations`: ```python -# Plain InCore + split hint: with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.split(pl.SplitMode.UP_DOWN)]): y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - -# AutoInCore + split hint (independent entries, combined freely): -with pl.at(level=pl.Level.CORE_GROUP, - optimizations=[pl.auto_chunk, pl.split(pl.SplitMode.UP_DOWN)]): - for i in pl.parallel(0, 8, 1, chunk=4): - x = pl.add(x, x) - -# Deprecated single-kwarg form (still works, emits DeprecationWarning): -with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - ... ``` ## Memory and Data Movement @@ -541,22 +514,28 @@ The `Default` strategy runs these passes in order: 1. **UnrollLoops** — unroll loop iterations 2. **CtrlFlowTransform** — rewrite control flow to structured IR 3. **ConvertToSSA** — convert to static single assignment form -4. **FlattenCallExpr** — flatten nested function calls -5. **SplitChunkedLoops** — split chunked loops into separate loops -6. **InterchangeChunkLoops** — interchange chunk loop ordering -7. **OutlineHierarchyScopes** — outline hierarchy scopes -8. **OutlineIncoreScopes** — outline InCore scopes into separate functions -9. **OutlineClusterScopes** — outline cluster scopes -10. 
**ConvertTensorToTileOps** — convert tensor operations to tile operations +4. **NormalizeStmtStructure** — flatten/unwrap redundant `SeqStmts` +5. **FlattenCallExpr** — flatten nested function calls +6. **OutlineHierarchyScopes** — outline non-CORE_GROUP `HierarchyScopeStmt` regions into `Function(Opaque)` +7. **OutlineIncoreScopes** — outline CORE_GROUP `HierarchyScopeStmt` regions into `Function(InCore)`; promote parent to `Orchestration` +8. **OutlineClusterScopes** — outline cluster scopes into Group functions +9. **ConvertTensorToTileOps** — convert tensor operations to tile operations +10. **OptimizeOrchTensors** — optimize orchestration-level tensor ops 11. **FlattenTileNdTo2D** — normalize ND tile ops to 2D 12. **InferTileMemorySpace** — infer tile memory spaces 13. **ResolveTransposeLayout** — repair transpose layout handling 14. **ResolveBackendOpLayouts** — repair backend-constrained tile layouts 15. **ExpandMixedKernel** — split mixed kernels when needed -16. **InitMemRef** — assign memory spaces and insert buffer allocations -17. **MemoryReuse** — share buffers with non-overlapping lifetimes -18. **LegalizePTOBufferReuse** — legalize PTO buffer reuse patterns -19. **AllocateMemoryAddr** — assign concrete memory addresses +16. **SplitVectorKernel** — split vector kernels when needed +17. **NormalizeReturnOrder** — reorder returns to match Out/InOut params +18. **PartialUnrollTileLoops** — partially unroll tile-level loops +19. **ReorderUnrolledIO** — group loads/stores of unrolled clones +20. **InitMemRef** — assign memory spaces and insert buffer allocations +21. **MemoryReuse** — share buffers with non-overlapping lifetimes +22. **LegalizePTOBufferReuse** — legalize PTO buffer reuse patterns +23. **AllocateMemoryAddr** — assign concrete memory addresses +24. **FuseCreateAssembleToSlice** — fuse create + assemble ops +25. 
**Simplify** — final simplification pass ### Debugging diff --git a/docs/en/user/02-operation_reference.md b/docs/en/user/02-operation_reference.md index 327e153b1..0fad072c9 100644 --- a/docs/en/user/02-operation_reference.md +++ b/docs/en/user/02-operation_reference.md @@ -215,6 +215,8 @@ Compare types: `EQ=0, NE=1, LT=2, LE=3, GT=4, GE=5` | `yield_` | `(*values: Any) -> Any \| tuple[Any, ...]` | Yield values from for/if scope | | `cond` | `(condition: bool \| Scalar) -> None` | Set while-loop condition (must be first statement) | | `const` | `(value: int \| float, dtype: DataType) -> int \| float` | Typed constant | -| `incore` | `() -> IncoreContext` | Context manager for InCore scope | +| `at` | `(*, level: Level, role: Role \| None = None, optimizations: Sequence[Optimization] \| None = None) -> AtContext` | Context manager for a hierarchy scope; `level=Level.CORE_GROUP` is the InCore form | +| `cluster` | `() -> ClusterContext` | Context manager for a cluster (AIC+AIV) scope | +| `spmd` | `(*, core_num: int \| Scalar, sync_start: bool = False) -> SpmdContext` | Context manager for a standalone SPMD launch scope | | `dynamic` | `(name: str) -> DynVar` | Create dynamic dimension variable | | `create_tensor` | `(shape: Sequence[IntLike], dtype: DataType, layout: TensorLayout = None) -> Tensor` | Create tensor (promoted from `pl.tensor`) | diff --git a/docs/zh-cn/dev/ir/01-hierarchy.md b/docs/zh-cn/dev/ir/01-hierarchy.md index 218c7969b..b8068b3b1 100644 --- a/docs/zh-cn/dev/ir/01-hierarchy.md +++ b/docs/zh-cn/dev/ir/01-hierarchy.md @@ -32,7 +32,12 @@ ::= "return" [ ] ::= ::= { ";" } - ::= "with" "pl.incore" "(" ")" ":" + ::= "with" "pl.at" "(" "level" "=" [ "," "role" "=" ] + [ "," "optimizations" "=" "[" "]" ] ")" + ":" + | "with" "pl.cluster" "(" ")" ":" + | "with" "pl.spmd" "(" "core_num" "=" + [ "," "sync_start" "=" ] ")" ":" ::= "break" ::= "continue" @@ -152,10 +157,8 @@ for_stmt = ir.ForStmt(i, start, stop, step, [sum_iter], body, [sum_final], span) | 
**IfStmt** | `condition_`, `then_stmts_`, `else_stmts_`, `return_vars_` | 条件分支 | | **ForStmt** | `loop_var_` (DefField), `start_`, `stop_`, `step_`, `iter_args_` (DefField), `body_`, `return_vars_` (DefField), `kind_` | 带可选迭代参数的 for 循环 | | **WhileStmt** | `condition_`, `iter_args_` (DefField), `body_`, `return_vars_` (DefField) | 带条件和迭代参数的 while 循环 | -| **InCoreScopeStmt** | `name_hint_`, `body_`, `split_`(可选) | InCore 区域;由 `OutlineIncoreScopes` 提取为 `Function(InCore)` | -| **AutoInCoreScopeStmt** | `name_hint_`, `body_`, `split_`(可选) | Auto-InCore 区域;由 `InterchangeChunkLoops` 消费 | | **ClusterScopeStmt** | `name_hint_`, `body_` | Cluster 区域;由 `OutlineClusterScopes` 提取为 `Function(Group)` | -| **HierarchyScopeStmt** | `name_hint_`, `body_`, `level_`, `role_`(可选) | 给定 Level/Role 的流水线阶段区域 | +| **HierarchyScopeStmt** | `name_hint_`, `body_`, `level_`, `role_`(可选), `split_`(可选) | 给定 Level/Role 的流水线阶段区域;当 `level_ == CORE_GROUP` 时提取为 `Function(InCore)`,其他层级提取为 `Function(Opaque)` | | **SpmdScopeStmt** | `name_hint_`, `body_`, `core_num_`, `sync_start_` | SPMD 启动区域;提取为 `Function(Spmd)` | | **YieldStmt** | `values_` | 在循环迭代中产出值 | | **EvalStmt** | `expr_` | 为副作用求值表达式 | @@ -220,24 +223,19 @@ while_stmt = ir.WhileStmt(condition, [x_iter], body, [x_final], span) ### ScopeStmt 详细说明 -`ScopeStmt` 是一个**抽象基类**,用于标记具有特定执行上下文的区域。下列五个具体子类 +`ScopeStmt` 是一个**抽象基类**,用于标记具有特定执行上下文的区域。下列三个具体子类 各自只携带其类型有效的字段——非法组合在构造时即不可表达。在 `ScopeStmt` 类型的引用上, 可使用 `s.scope_kind`(C++ 中为 `s.GetScopeKind()`)来取回类型,或使用 -`isinstance(s, InCoreScopeStmt)` 在具体类型上分派。 +`isinstance(s, HierarchyScopeStmt)` 在具体类型上分派。 -五个子类共享公共基类字段 `name_hint_: str` 和 `body_: StmtPtr`。注意: -`pl.at(level=Level.CORE_GROUP)` 实际下沉到 `InCoreScopeStmt` / -`AutoInCoreScopeStmt`,而非 `HierarchyScopeStmt`——解析器会在 `CORE_GROUP` -拒绝 `role=`。`HierarchyScopeStmt` 仅用于非 `CORE_GROUP` 的层级 -(host、cluster、global),并不是 in-core 作用域的通用替代。 +三个子类共享公共基类字段 `name_hint_: str` 和 `body_: StmtPtr`。`pl.at(level=...)` +统一下沉到 `HierarchyScopeStmt`——包括 `level=Level.CORE_GROUP`,它会产生 
+`level_ == CORE_GROUP` 且可选携带 `split_` 的 `HierarchyScopeStmt`。`OutlineIncoreScopes` +随后把该 `CORE_GROUP` 作用域提取为 `Function(InCore)`,并将其父 `Opaque` +函数升级为 `Orchestration`。非 `CORE_GROUP` 的 `HierarchyScopeStmt` 则由 +紧邻其前执行的 `OutlineHierarchyScopes` 提取为 `Function(Opaque)`。 ```python -# with pl.incore(): y = pl.add(x, x) -in_core = ir.InCoreScopeStmt(name_hint="", body=body, span=span) - -# with pl.auto_incore(): (split 可选) -auto = ir.AutoInCoreScopeStmt(name_hint="", body=body, span=span) - # with pl.cluster(): cluster = ir.ClusterScopeStmt(name_hint="", body=body, span=span) @@ -245,6 +243,12 @@ cluster = ir.ClusterScopeStmt(name_hint="", body=body, span=span) hier = ir.HierarchyScopeStmt(level=ir.Level.HOST, role=ir.Role.Worker, name_hint="", body=body, span=span) +# with pl.at(level=Level.CORE_GROUP, +# optimizations=[pl.split(pl.SplitMode.UP_DOWN)]): +hier_core = ir.HierarchyScopeStmt(level=ir.Level.CORE_GROUP, + split=ir.SplitMode.UP_DOWN, + name_hint="", body=body, span=span) + # with pl.spmd(core_num=8): spmd = ir.SpmdScopeStmt(core_num=8, sync_start=False, name_hint="", body=body, span=span) @@ -256,20 +260,31 @@ spmd = ir.SpmdScopeStmt(core_num=8, sync_start=False, (执行一次,线性执行)。 - 必填字段在构造时强制校验:`HierarchyScopeStmt.level_` 不可为空; `SpmdScopeStmt` 拒绝 `core_num <= 0`。 -- `InCoreScopeStmt` / `AutoInCoreScopeStmt` 已计划弃用;新代码应优先使用 - `HierarchyScopeStmt` 或其它将保留的子类。 +- `HierarchyScopeStmt.split_` 可选,且仅在 `Level.CORE_GROUP` 下有意义。 + 它会被复制到提取出的 `InCore` 函数 attrs 上,供 `ExpandMixedKernel` 读取。 - Pass 行为: - - `InterchangeChunkLoops` 消费 `AutoInCoreScopeStmt` - - `OutlineIncoreScopes` 将 `InCoreScopeStmt` 提取为 `Function(InCore)` + - `OutlineHierarchyScopes` 将每个非 `CORE_GROUP` 的 `HierarchyScopeStmt` + 提取为一个独立的 `FunctionType::Opaque` 函数,父函数类型保持不变。 + - `OutlineIncoreScopes`(紧随其后执行)将每个 `CORE_GROUP` + `HierarchyScopeStmt` 提取为一个独立的 `FunctionType::InCore` 函数。 + 包含至少一个 `CORE_GROUP` 作用域的父函数由 `Opaque` 升级为 + `Orchestration`。 - `OutlineClusterScopes` 将 `ClusterScopeStmt` 提取为 `Function(Group)`, - 将独立的 
`SpmdScopeStmt` 提取为 `Function(Spmd)` - - `OutlineHierarchyScopes` 提取 `HierarchyScopeStmt` + 将独立的 `SpmdScopeStmt` 提取为 `Function(Spmd)`。 **变换示例:** ```python -# Before: with pl.incore(): y = pl.add(x, x); return y -# After: main_incore_0(x) -> y; main(x): y = main_incore_0(x); return y +# Before: +# def main(x): +# with pl.at(level=pl.Level.CORE_GROUP): +# y = pl.add(x, x) +# return y +# After: +# def main_core_group_0(x) -> y: ... # FunctionType.InCore +# def main(x) -> y: # FunctionType.Orchestration +# y = main_core_group_0(x) +# return y ``` **并行 for 循环 (ForKind):** @@ -411,7 +426,7 @@ add_func = program.get_function("add") # Access by name | **一元运算** | 5 | Abs, Neg, Not, BitNot, Cast | | **调用/访问** | 2 | Call, TupleGetItemExpr | | **操作** | 2 | Op, GlobalVar | -| **语句** | 15 | AssignStmt, IfStmt, ForStmt, WhileStmt, ReturnStmt, InCoreScopeStmt, AutoInCoreScopeStmt, ClusterScopeStmt, HierarchyScopeStmt, SpmdScopeStmt, YieldStmt, EvalStmt, SeqStmts, BreakStmt, ContinueStmt | +| **语句** | 13 | AssignStmt, IfStmt, ForStmt, WhileStmt, ReturnStmt, ClusterScopeStmt, HierarchyScopeStmt, SpmdScopeStmt, YieldStmt, EvalStmt, SeqStmts, BreakStmt, ContinueStmt | | **类型** | 6 | ScalarType, TensorType, TileType, TupleType, PipeType, UnknownType | | **函数** | 2 | Function, Program | diff --git a/docs/zh-cn/dev/language/00-python_syntax.md b/docs/zh-cn/dev/language/00-python_syntax.md index ae3e7d684..5987676b0 100644 --- a/docs/zh-cn/dev/language/00-python_syntax.md +++ b/docs/zh-cn/dev/language/00-python_syntax.md @@ -255,22 +255,17 @@ for i in pl.unroll(12, chunk=4): body_statements ``` -**要点:** `chunk=C` 将循环拆分为外层顺序循环和 `C` 次迭代的内层循环。内层循环保留原始类型 (Sequential/Parallel/Unroll)。`chunk` 不能与 `init_values` 一起使用,且 `chunk=` 循环只能出现在 `with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk]):` 内;在该作用域外,parser 会直接报错。参见 [SplitChunkedLoops Pass](../passes/05-split_chunked_loops.md)。 +**要点:** `chunk=C` 将循环拆分为外层顺序循环和 `C` 次迭代的内层循环。内层循环保留原始类型 (Sequential/Parallel/Unroll)。`chunk` 不能与 
`init_values` 一起使用。 ### 作用域上下文管理器 (Scope Context Managers) -| 形式 | Scope 类型 | 说明 | -| ---- | ---------- | ---- | -| `pl.at(level=pl.Level.CORE_GROUP)` | `InCore` | CORE_GROUP 级固定边界 outline | -| `pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.split(MODE)])` | `InCore` | InCore + 跨核 split 提示 | -| `pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk])` | `AutoInCore` | 编译器驱动的 chunked 循环 split | -| `pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk, pl.split(MODE)])` | `AutoInCore` | AutoInCore + split 提示(条目独立) | -| `pl.at(level=pl.Level.HOST)`(或任意非 `CORE_GROUP` 级别) | `Hierarchy` | 分布式层级作用域 | -| `pl.cluster()` | `Cluster` | AIC+AIV 协同调度组 | -| `pl.incore()` *(已弃用)* | `InCore` | 请改用 `pl.at(level=pl.Level.CORE_GROUP)` | -| `pl.auto_incore(split=...)` *(已弃用)* | `AutoInCore` | 请改用 `pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk, pl.split(...)])` | -| `pl.at(..., optimization=pl.chunked_loop_optimizer[(split=...)])` *(已弃用)* | `AutoInCore` | 请改用 `pl.at(..., optimizations=[pl.auto_chunk, pl.split(...)])` | -| `pl.at(..., split=...)` *(已弃用)* | `InCore` | 请改用 `pl.at(..., optimizations=[pl.split(...)])` | +| 形式 | 产生 | 说明 | +| ---- | ---- | ---- | +| `pl.at(level=pl.Level.CORE_GROUP)` | `HierarchyScopeStmt`(level=CORE_GROUP) | 由 `OutlineIncoreScopes` 提取为 `Function(InCore)`;其父 `Opaque` 函数升级为 `Orchestration` | +| `pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.split(MODE)])` | `HierarchyScopeStmt`(level=CORE_GROUP, split=MODE) | 同上;split 提示随提取函数保留,由 `ExpandMixedKernel` 消费 | +| `pl.at(level=pl.Level.HOST)`(或任意非 `CORE_GROUP` 层级) | `HierarchyScopeStmt`(level=HOST/...) 
| 提取为 `Function(Opaque)`;父函数类型保持不变 | +| `pl.cluster()` | `ClusterScopeStmt` | 由 `OutlineClusterScopes` 提取为 `Function(Group)` | +| `pl.spmd(core_num=N[, sync_start=...])` | `SpmdScopeStmt` | 非 cluster 内的 standalone spmd 提取为 `Function(Spmd)`;若位于 cluster 内,属性会被合并到 Group 函数上 | 示例参见 [语言指南](../../user/01-language_guide.md#incore-作用域)。 diff --git a/docs/zh-cn/dev/passes/00-pass_manager.md b/docs/zh-cn/dev/passes/00-pass_manager.md index 26c2b2c7a..8ebee8d2a 100644 --- a/docs/zh-cn/dev/passes/00-pass_manager.md +++ b/docs/zh-cn/dev/passes/00-pass_manager.md @@ -33,7 +33,7 @@ | `NoNestedCalls` | 无嵌套调用表达式 (Expression) | | `NormalizedStmtStructure` | 语句 (Statement) 结构已规范化 | | `NoRedundantBlocks` | 无单子节点或嵌套的 SeqStmts | -| `SplitIncoreOrch` | InCore 作用域已提取为独立函数 | +| `HierarchyOutlined` | `HierarchyScopeStmt` 区域已提取为函数(非 CORE_GROUP 由 `OutlineHierarchyScopes` 提取为 `Opaque`;`CORE_GROUP` 由 `OutlineIncoreScopes` 提取为 `InCore`);当存在 `CORE_GROUP` 作用域被提取时,父函数由 `Opaque` 升级为 `Orchestration`。由 `OutlineIncoreScopes`(两个 outline Pass 中的后者)产生。 | | `ClusterOutlined` | Cluster 作用域已提取为 Group 函数 | | `HasMemRefs` | 变量上已初始化内存引用 (MemRef) 对象 | | `IncoreTileOps` | InCore 函数使用 tile 操作 | @@ -61,21 +61,20 @@ struct PassProperties { | UnrollLoops | TypeChecked | TypeChecked | — | | CtrlFlowTransform | TypeChecked | TypeChecked, StructuredCtrlFlow | — | | ConvertToSSA | TypeChecked | TypeChecked, SSAForm | NormalizedStmtStructure | -| FlattenCallExpr | SSAForm | SSAForm, NoNestedCalls | NormalizedStmtStructure | -| SplitChunkedLoops | TypeChecked, SSAForm | TypeChecked, SSAForm | — | -| InterchangeChunkLoops | TypeChecked, SSAForm | TypeChecked, SSAForm | — | | NormalizeStmtStructure | TypeChecked | TypeChecked, NormalizedStmtStructure | — | -| OutlineIncoreScopes | TypeChecked, SSAForm | SplitIncoreOrch | — | +| FlattenCallExpr | SSAForm | SSAForm, NoNestedCalls | NormalizedStmtStructure | +| OutlineHierarchyScopes | SSAForm | SSAForm | — | +| OutlineIncoreScopes | SSAForm | SSAForm, HierarchyOutlined | — | | 
OutlineClusterScopes | TypeChecked, SSAForm | ClusterOutlined | — | -| ConvertTensorToTileOps | SplitIncoreOrch | IncoreTileOps | — | +| ConvertTensorToTileOps | HierarchyOutlined | IncoreTileOps | — | | FlattenTileNdTo2D | SSAForm, IncoreTileOps | SSAForm, TileOps2D | — | -| ResolveBackendOpLayouts | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D | NormalizedStmtStructure | -| ExpandMixedKernel | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D | SSAForm, MixedKernelExpanded | — | -| NormalizeReturnOrder | SplitIncoreOrch, IncoreTileOps | — | — | -| InitMemRef | TypeChecked, SSAForm, SplitIncoreOrch, IncoreTileOps, TileOps2D | HasMemRefs | SSAForm | -| MemoryReuse | TypeChecked, SplitIncoreOrch, IncoreTileOps, HasMemRefs, TileOps2D | — | — | -| InsertSync | TypeChecked, SplitIncoreOrch, IncoreTileOps, HasMemRefs, TileOps2D | — | — | -| AllocateMemoryAddr | TypeChecked, SplitIncoreOrch, IncoreTileOps, HasMemRefs, TileOps2D | AllocatedMemoryAddr | — | +| ResolveBackendOpLayouts | SSAForm, IncoreTileOps, HierarchyOutlined, TileOps2D | SSAForm, IncoreTileOps, HierarchyOutlined, TileOps2D | NormalizedStmtStructure | +| ExpandMixedKernel | SSAForm, IncoreTileOps, HierarchyOutlined, TileOps2D | SSAForm, MixedKernelExpanded | — | +| NormalizeReturnOrder | HierarchyOutlined, IncoreTileOps | — | — | +| InitMemRef | TypeChecked, SSAForm, HierarchyOutlined, IncoreTileOps, TileOps2D | HasMemRefs | SSAForm | +| MemoryReuse | TypeChecked, HierarchyOutlined, IncoreTileOps, HasMemRefs, TileOps2D | — | — | +| InsertSync | TypeChecked, HierarchyOutlined, IncoreTileOps, HasMemRefs, TileOps2D | — | — | +| AllocateMemoryAddr | TypeChecked, HierarchyOutlined, IncoreTileOps, HasMemRefs, TileOps2D | AllocatedMemoryAddr | — | | FuseCreateAssembleToSlice | — | — | — | | Simplify | — | — | — | diff --git a/docs/zh-cn/dev/passes/01-unroll_loops.md b/docs/zh-cn/dev/passes/01-unroll_loops.md index 5dd2217d4..57ddec901 100644 --- 
a/docs/zh-cn/dev/passes/01-unroll_loops.md +++ b/docs/zh-cn/dev/passes/01-unroll_loops.md @@ -77,10 +77,10 @@ class After: UnrollLoops 在 `Default` 和 `DebugTileOptimization` 中都只**运行一次**,位于控制流结构化之前: ```text -UnrollLoops → CtrlFlowTransform → ConvertToSSA → FlattenCallExpr → SplitChunkedLoops → InterchangeChunkLoops → OutlineIncoreScopes → ... +UnrollLoops → CtrlFlowTransform → ConvertToSSA → NormalizeStmtStructure → FlattenCallExpr → OutlineHierarchyScopes → OutlineIncoreScopes → OutlineClusterScopes → ... ``` -UnrollLoops 展开非分块的 `pl.unroll()` 循环(跳过分块展开循环,保留 `chunk` 供后续 `SplitChunkedLoops` 处理)。 +UnrollLoops 将 `pl.unroll()` 循环展开为其内联复制。 ## Pass 属性 diff --git a/docs/zh-cn/dev/passes/02-ctrl_flow_transform.md b/docs/zh-cn/dev/passes/02-ctrl_flow_transform.md index 690c97362..194c35af8 100644 --- a/docs/zh-cn/dev/passes/02-ctrl_flow_transform.md +++ b/docs/zh-cn/dev/passes/02-ctrl_flow_transform.md @@ -163,7 +163,7 @@ while i < n and not __break_0: CtrlFlowTransform 在 UnrollLoops 之后、ConvertToSSA 之前运行: ```text -UnrollLoops -> CtrlFlowTransform -> ConvertToSSA -> FlattenCallExpr -> SplitChunkedLoops -> ... +UnrollLoops -> CtrlFlowTransform -> ConvertToSSA -> NormalizeStmtStructure -> FlattenCallExpr -> OutlineHierarchyScopes -> ... 
``` ## Pass 属性 diff --git a/docs/zh-cn/dev/passes/03-convert_to_ssa.md b/docs/zh-cn/dev/passes/03-convert_to_ssa.md index cd1a95c36..84123415d 100644 --- a/docs/zh-cn/dev/passes/03-convert_to_ssa.md +++ b/docs/zh-cn/dev/passes/03-convert_to_ssa.md @@ -13,7 +13,7 @@ **需要**:TypeChecked 属性 (Property)(需在运行本 Pass 之前已建立,可通过属性验证/`VerificationInstrument` 等机制检查)。 -**使用时机**:在任何需要 SSA 形式的优化或分析之前运行此 Pass(如 OutlineIncoreScopes、内存优化 Pass)。 +**使用时机**:在任何需要 SSA 形式的优化或分析之前运行此 Pass(如 OutlineHierarchyScopes、内存优化 Pass)。 ## API diff --git a/docs/zh-cn/dev/passes/05-outline_hierarchy_scopes.md b/docs/zh-cn/dev/passes/05-outline_hierarchy_scopes.md new file mode 100644 index 000000000..f42dbad27 --- /dev/null +++ b/docs/zh-cn/dev/passes/05-outline_hierarchy_scopes.md @@ -0,0 +1,191 @@ +# OutlineHierarchyScopes Pass + +将非 `CORE_GROUP` 的 `HierarchyScopeStmt` 区域提取为独立的 `Opaque` 函数, +并把作用域的 level/role 元信息带到提取出的函数上。 + +## 概述 + +该 Pass 把每个 `level_` 不为 `Level.CORE_GROUP` 的 `HierarchyScopeStmt` +变换为独立的 `Function` 定义,并将原作用域替换为对该函数的 `Call`。提取出 +的函数类型恒为 `FunctionType::Opaque`;父函数的类型保持不变。 + +| 作用域 `level_` | 本 Pass 是否处理 | 提取出的函数类型 | 父函数类型(Pass 后) | +| --------------- | ---------------- | ---------------- | --------------------- | +| `Level.HOST`、`Level.CLUSTER`、`Level.GLOBAL`、... 
| 是 | `FunctionType::Opaque` | 保持不变 | +| `Level.CORE_GROUP` | **否 —— 有意跳过** | *(由 [`OutlineIncoreScopes`](06-outline_incore_scopes.md) 处理)* | *(由下一个 Pass 提升为 `Orchestration`)* | + +`CORE_GROUP` 作用域在本 Pass 中被有意保留;紧接着执行的 +[`OutlineIncoreScopes`](06-outline_incore_scopes.md) 会把它们提取为 +`Function(InCore)` 并将父函数由 `Opaque` 提升为 `Orchestration`。 + +**前置条件**: + +- 输入 IR 必须为 SSA 形式(需先运行 `ConvertToSSA`)。本 Pass 保留 + (产生)SSA 形式。 +- 处理 `Opaque` 函数。已经为 `Orchestration`、`InCore`、`AIC`、`AIV`、 + `Group` 的函数保持不变。 + +**使用时机**:在 `ConvertToSSA`/`FlattenCallExpr` 之后运行,当 IR 中包含 +非 `CORE_GROUP` 层级的 `with pl.at(level=...):` 作用域需要提取为独立辅助 +函数时使用。 + +## API + +| C++ | Python | 级别 | +| --- | ------ | ---- | +| `pass::OutlineHierarchyScopes()` | `passes.outline_hierarchy_scopes()` | 程序级 | + +**工厂函数**: + +```cpp +Pass OutlineHierarchyScopes(); +``` + +**Python 用法**: + +```python +from pypto.pypto_core import passes + +outline_pass = passes.outline_hierarchy_scopes() +program_outlined = outline_pass(program) +``` + +## 算法 + +1. **扫描 Hierarchy 作用域**:在每个 `Opaque` 函数体中查找所有 `level_` + **不为** `CORE_GROUP` 的 `HierarchyScopeStmt` 节点。 +2. **分析输入/输出**:复用 scope_outline_utils 辅助工具计算外部定义、内部 + 使用的变量(输入)以及内部定义、外部使用的变量(输出)。 +3. **创建提取函数**:将作用域体提取为新的 `Function`: + - 参数 = 输入变量 + - 返回值 = 输出变量 + - 函数体 = 作用域体 + - `func_type_` = `Opaque` + - 将 `role_` 元信息复制到函数 attrs。 +4. **替换作用域**:将原 `HierarchyScopeStmt` 替换为对提取函数的 `Call` + + 绑定返回值的若干 `AssignStmt`。 +5. **保持父函数类型**:本 Pass 不修改父函数的 `func_type_`。对 + `CORE_GROUP` 作用域的父函数提升由 + [`OutlineIncoreScopes`](06-outline_incore_scopes.md) 负责。 +6. 
**加入程序**:将提取出的函数前置到程序的函数列表中。 + +**命名规则**:`{原函数名}_{level}_{计数器}`(例如 `main_host_0`、 +`main_global_0`)。若 `HierarchyScopeStmt.name_hint` 非空,则直接使用该 +name_hint。 + +## 示例 + +### 非 CORE_GROUP 层级(HOST) + +**之前**: + +```python +@pl.program +class Before: + @pl.function # Opaque + def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + with pl.at(level=pl.Level.HOST): + y = helper(x) + return y +``` + +**之后**(父函数仍为 `Opaque`,提取函数也是 `Opaque`): + +```python +@pl.program +class After: + @pl.function # 未变 + def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + y = self.main_host_0(x) + return y + + @pl.function # Opaque + def main_host_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + y = helper(x) + return y +``` + +### 多输出 + +```python +with pl.at(level=pl.Level.HOST): + a_tile = pl.load(a, [0], [64]) + b_tile = pl.load(b, [0], [64]) + c_tile = pl.add(a_tile, b_tile) + out_a = pl.store(c_tile, [0], out) + out_b = pl.mul(c_tile, 2.0) +# out_a 与 out_b 都在作用域之后被使用 +x = out_a + out_b +``` + +提取后的函数体变为: + +```python +out_a, out_b = self.main_host_0(a, b, out) # 多返回值 +x = out_a + out_b +``` + +### CORE_GROUP 作用域会被跳过 + +```python +@pl.function # Opaque +def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + with pl.at(level=pl.Level.CORE_GROUP): # <-- 本 Pass 不处理 + tile = pl.load(x, [0], [64]) + result = pl.store(tile, [0], x) + return result +``` + +本 Pass 会把上述 `CORE_GROUP` 作用域原样保留。下一个流水线 Pass +[`OutlineIncoreScopes`](06-outline_incore_scopes.md) 会把它提取为 +`Function(InCore)` 并把父函数提升为 `Orchestration`。 + +## 实现 + +**头文件**:`include/pypto/ir/transforms/passes.h` + +```cpp +Pass OutlineHierarchyScopes(); +``` + +**实现文件**:`src/ir/transforms/outline_hierarchy_scopes.cpp` + +- 使用公共 `scope_outline_utils` 计算输入/输出 +- 对每个非 `CORE_GROUP` 作用域构造新的 `Function(Opaque)` +- 将 `role_` 元信息复制到提取函数的 attrs +- 从不修改父函数的 `func_type_` + +**Python 绑定**:`python/bindings/modules/passes.cpp` + +```cpp +passes.def("outline_hierarchy_scopes", 
&pass::OutlineHierarchyScopes, + "Outline non-CORE_GROUP HierarchyScopeStmt regions into Opaque functions"); +``` + +**测试**:`tests/ut/ir/transforms/test_outline_hierarchy_scopes.py` + +- 测试非 `CORE_GROUP` 作用域 → `Opaque` 函数 + 父函数不变 +- 测试 `CORE_GROUP` 作用域保持原样不被处理 +- 测试输入/输出分析 +- 测试同一父函数中多个非 `CORE_GROUP` 作用域 +- 测试 SSA 保留 + +## Pass 属性 + +| 属性 | 值 | +| ---- | -- | +| 所需 | `SSAForm` | +| 产生 | `SSAForm` | +| 失效 | — | + +`HierarchyOutlined` 现由紧随其后的 +[`OutlineIncoreScopes`](06-outline_incore_scopes.md) Pass 产生,它负责 +处理剩余的 `CORE_GROUP` 作用域。 + +## 流水线位置 + +```text +... → ConvertToSSA → NormalizeStmtStructure → FlattenCallExpr → +OutlineHierarchyScopes → OutlineIncoreScopes → OutlineClusterScopes → +ConvertTensorToTileOps → ... +``` diff --git a/docs/zh-cn/dev/passes/05-split_chunked_loops.md b/docs/zh-cn/dev/passes/05-split_chunked_loops.md deleted file mode 100644 index 0dfd8b4ff..000000000 --- a/docs/zh-cn/dev/passes/05-split_chunked_loops.md +++ /dev/null @@ -1,187 +0,0 @@ -# SplitChunkedLoops Pass - -将带有 `chunk` 的循环按两种策略之一拆分为嵌套的外层/内层循环。 - -## 概述 - -此 Pass 将使用 `chunk=C` 创建的 for 循环转换为嵌套循环:外层循环遍历分块索引,内层循环在每个分块内迭代。支持两种生成策略: - -- **`guarded`**(默认)— 发射一个长度为 `ceil(T/C)` 的外层循环和一个长度为 `C` 的内层循环,并用 `if (idx < stop)`(负步长时为 `idx > stop`)包裹循环体。越界迭代变为空操作。只发射一个 kernel。 -- **`leading_full`** — 发射一个长度为 `T/C` 的满块循环加一个长度为 `T % C` 的独立余数循环。发射两个并列循环。 - -两种策略都在 SSA 转换之后运行,并将 `iter_args` 传播到生成的循环中。 - -**前置条件**: `TypeChecked`、`SSAForm`。 - -**使用时机**: 在默认流水线中自动运行,位于 `FlattenCallExpr` 之后、`InterchangeChunkLoops` 之前。在 `with pl.auto_incore():` 作用域内的 `pl.range()`、`pl.parallel()`、`pl.unroll()` 上使用 `chunk=`。`auto_incore` 之外的分块循环不会被拆分。 - -## API - -| C++ | Python | 级别 | -| --- | ------ | ---- | -| `pass::SplitChunkedLoops()` | `passes.split_chunked_loops()` | 函数级 | - -```python -from pypto import passes -result = passes.split_chunked_loops()(program) -``` - -## DSL 语法 - -分块循环必须包裹在 `with pl.auto_incore():` 中: - -```python -with pl.auto_incore(): - # 默认 (guarded):单 kernel + if-guard - for i in pl.range(10, 
chunk=5): - x = pl.add(x, 1.0) - - # 显式 guarded(与默认等价) - for i in pl.parallel(n, chunk=4, chunk_policy="guarded"): - x = pl.add(x, 1.0) - - # 显式 leading_full:余数剥离为独立循环 - for i in pl.range(7, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - - # 两种策略都支持 iter_args - for i, (s,) in pl.range(10, init_values=(x,), chunk=5): - s = pl.add(s, 1.0) - s = pl.yield_(s) -``` - -## 策略选择 - -| 场景 | 偏好 `guarded` | 偏好 `leading_full` | -| ---- | -------------- | ------------------- | -| 动态 bound(`stop` 非编译期常量) | ✅ —— 单 kernel 保留跨边界的 loop-carried 状态 | ❌ —— 余数 kernel 的 iter_args 只能以 input-only 拷贝方式传入,破坏跨迭代累积 | -| 静态 bound 且可整除 | guard 稍显冗余 | ✅ —— 无 guard、无余数 | -| 希望 `pl.auto_incore()` 下 kernel 数量最少 | ✅ | 每个分块循环会生成 2 个 kernel | -| 希望热点循环内部不存在掩码迭代 | ❌ | ✅ —— 满块无条件执行 | - -`guarded` 被设为默认,原因在于:(1) 动态 bound 下能保留 `add_inout()` 累积;(2) 避免 `pl.auto_incore()` 下 kernel 数量翻倍。 - -## 约束 - -| 约束 | 原因 | -| ---- | ---- | -| `step`、`chunk` 必须为整数常量 | 编译期需要确定值 | -| `chunk` 必须为正整数 | 非正数的分块大小无效 | -| `step` 可以为负(下降循环) | `guarded` 会根据步长符号选择判据 | -| `start`、`stop` 在 `guarded` 下可以是动态表达式 | 迭代次数取 `max(abs(stop - start), 0) / abs(step)` | -| 分块循环必须在 `pl.auto_incore()` 内 | 仅 `auto_incore` 作用域内的循环会被拆分 | -| `chunk` 可以与 `init_values` 同时使用 | 两种策略都会将 iter_args 串联到生成的循环 | - -## 算法 - -记 `T = ceil(max(|stop - start|, 0) / |step|)`,`C = chunk`。 - -### `guarded`(默认) - -1. `n_total = ceil(T / C)`。静态 bound 直接计算,动态 bound 用 `(T + C - 1) // C`。 -2. 发射外层循环 `for out_var in [0, n_total)` 与内层循环 `for in_var in [0, C)`。 -3. 计算 `idx = start + (out_var * C + in_var) * step`,并替换到循环体里。 -4. 将访问后的循环体包裹进 `IfStmt`,条件为: - - `idx < stop`(当 `step > 0`) - - `idx > stop`(当 `step < 0`) -5. **无 iter_args** —— IfStmt 无 else 分支;被跳过的迭代为空操作。 -6. **有 iter_args** —— IfStmt 的 `return_vars` 作为 phi:then 分支保留用户循环体的末尾 `YieldStmt`(更新后的值),else 分支 yield 未变的 inner iter_args。内层循环的末尾 `YieldStmt` 引用 IfStmt 的 phi 变量,从而在生效与被跳过的迭代之间都能串联循环携带状态。 - -### `leading_full` - -1. `n_full = T // C`,`n_rem = T % C`。 -2. 
发射外层 `for out_var in [0, n_full)` 与内层 `for in_var in [0, C)`,`idx = start + (out_var * C + in_var) * step`;若 `n_full == 0` 则跳过。 -3. 若 `n_rem > 0`,发射余数循环 `for rem_var in [0, n_rem)`,`idx = start + (n_full * C + rem_var) * step`。其 `init_values` 链接自外层循环的 `return_vars`(如果没有满块循环,则链接自原始 init 值)。 -4. 将原始 `return_vars` 重映射到最终循环的 `return_vars`。 - -两种路径都在内层与外层/余数循环上保留原始的 `ForKind`(Sequential、Parallel、Unroll)。 - -## 自动命名缩写 - -打印出来的 IR 使用紧凑的自动命名格式 `base__qualifier_role_vN`。缩写 qualifier: - -| 缩写 | 含义 | 发射时机 | -| ---- | ---- | -------- | -| `co` | chunk_outer | 两种策略 | -| `ci` | chunk_inner | 两种策略 | -| `cr` | chunk_rem(余数) | 仅 `leading_full` | -| `cg` | chunk_guard(IfStmt phi) | 仅带 iter_args 的 `guarded` | - -示例:`i__co_idx_v0`(外层索引)、`x__ci_iter_v1`(内层 iter_arg)、`x__cr_rv_v1`(余数 return var)、`x__cg_rv_v1`(IfStmt phi 变量)。 - -## 示例 - -### `guarded`,可整除(`chunk=5`,trip_count=10) - -**之后**: - -```python -for i__co_idx_v0, (x__co_iter_v1,) in pl.range(2, init_values=(x__ssa_v0,)): - for i__ci_idx_v0, (x__ci_iter_v1,) in pl.range(5, init_values=(x__co_iter_v1,)): - if i__co_idx_v0 * 5 + i__ci_idx_v0 < 10: - x__ssa_v3 = pl.tensor.add(x__ci_iter_v1, 1.0) - x__cg_rv_v1 = pl.yield_(x__ssa_v3) - else: - x__cg_rv_v1 = pl.yield_(x__ci_iter_v1) - x__ci_rv_v1 = pl.yield_(x__cg_rv_v1) - x__co_rv_v1 = pl.yield_(x__ci_rv_v1) -return x__co_rv_v1 -``` - -### `guarded`,动态 bound(`chunk=4`,`stop=n`) - -**之后**(单 kernel,`n_total = (n + 3) // 4`): - -```python -for i__co_idx_v0, (x__co_iter_v1,) in pl.range((n + 3) // 4, init_values=(x__ssa_v0,)): - for i__ci_idx_v0, (x__ci_iter_v1,) in pl.range(4, init_values=(x__co_iter_v1,)): - if i__co_idx_v0 * 4 + i__ci_idx_v0 < n: - x__ssa_v3 = pl.tensor.add(x__ci_iter_v1, 1.0) - x__cg_rv_v1 = pl.yield_(x__ssa_v3) - else: - x__cg_rv_v1 = pl.yield_(x__ci_iter_v1) - x__ci_rv_v1 = pl.yield_(x__cg_rv_v1) - x__co_rv_v1 = pl.yield_(x__ci_rv_v1) -return x__co_rv_v1 -``` - -### `leading_full`,不可整除(`chunk=5`,trip_count=7) - -**之后**(两个并列循环): - -```python -for i__co_idx_v0, 
(x__co_iter_v1,) in pl.range(1, init_values=(x__ssa_v0,)): - for i__ci_idx_v0, (x__ci_iter_v1,) in pl.range(5, init_values=(x__co_iter_v1,)): - x__ssa_v3 = pl.tensor.add(x__ci_iter_v1, 1.0) - x__ci_rv_v1 = pl.yield_(x__ssa_v3) - x__co_rv_v1 = pl.yield_(x__ci_rv_v1) -for i__cr_idx_v0, (x__cr_iter_v1,) in pl.range(2, init_values=(x__co_rv_v1,)): - x__ssa_v4 = pl.tensor.add(x__cr_iter_v1, 1.0) - x__cr_rv_v1 = pl.yield_(x__ssa_v4) -return x__cr_rv_v1 -``` - -## LoopOrigin 标记 - -| LoopOrigin | 说明 | 发射时机 | -| ---------- | ---- | -------- | -| `Original` | 普通用户循环(默认) | — | -| `ChunkOuter` | 遍历分块索引的外层循环 | 两种策略 | -| `ChunkInner` | 在分块内迭代的内层循环 | 两种策略 | -| `ChunkRemainder` | 处理剩余迭代的余数循环 | 仅 `leading_full` | - -通过 `for_stmt.attrs.get("loop_origin")`(Python)或 `for_stmt->GetAttr("loop_origin")`(C++)访问。 - -## 流水线位置 - -```text -UnrollLoops → ConvertToSSA → FlattenCallExpr → SplitChunkedLoops → InterchangeChunkLoops → OutlineIncoreScopes → ... -``` - -## Pass 属性 - -| 属性 | 值 | -| ---- | -- | -| Required | `TypeChecked`、`SSAForm` | -| Produced | `TypeChecked`、`SSAForm` | -| Invalidated | (无) | diff --git a/docs/zh-cn/dev/passes/06-interchange_chunk_loops.md b/docs/zh-cn/dev/passes/06-interchange_chunk_loops.md deleted file mode 100644 index a50ebb691..000000000 --- a/docs/zh-cn/dev/passes/06-interchange_chunk_loops.md +++ /dev/null @@ -1,197 +0,0 @@ -# InterchangeChunkLoops Pass - -重新排列嵌套的 ChunkOuter/ChunkInner 循环对并插入 `InCore` 作用域,为下游提取做准备。 - -## 概述 - -在 `SplitChunkedLoops` 将分块循环拆分为嵌套的 `ChunkOuter→ChunkInner` 对之后,嵌套分块循环的结构为: - -```text -i_out[ChunkOuter] → i_in[ChunkInner,Parallel] → j_out[ChunkOuter] → j_in[ChunkInner,Parallel] → body -``` - -此 Pass 重新排列,使所有外层循环在顶部,并将内层循环 + 循环体包裹在 `InCoreScopeStmt` 中: - -```text -i_out[ChunkOuter] → j_out[ChunkOuter] → InCore{ i_in[ChunkInner] → j_in[ChunkInner] → body } -``` - -**前置条件**: TypeChecked、SSAForm 属性。 - -**使用时机**: 在默认流水线中自动运行,位于 `SplitChunkedLoops` 之后、`OutlineIncoreScopes` 之前。仅处理 `pl.auto_incore()` 作用域内的循环。此 Pass 会消费(移除)`AutoInCore` 作用域。 
- -## API - -| C++ | Python | 级别 | -| --- | ------ | ---- | -| `pass::InterchangeChunkLoops()` | `passes.interchange_chunk_loops()` | 函数级 | - -**Python 用法**: - -```python -from pypto import passes - -result = passes.interchange_chunk_loops()(program) -``` - -## 约束 - -| 约束 | 行为 | -| ---- | ---- | -| 仅 SSA | 在 `SplitChunkedLoops` 之后运行(需要 `SSAForm`) | -| 仅并行交换 | 仅当所有 ChunkInner 循环具有 `ForKind::Parallel` 时才交换 | -| 顺序分块循环 | 不交换,但如果在 `auto_incore` 内则包裹在 InCore 中 | -| 已有 InCore | 如果链体已包含 `InCoreScopeStmt`,则跳过 | -| 需要 `auto_incore` 作用域 | 仅处理 `AutoInCoreScopeStmt` 内的循环;该作用域会被消费 | - -## 算法 - -1. **收集链** — 从 `ChunkOuter` ForStmt 开始,遍历嵌套的 ForStmt 体。构建 `(ForStmt, LoopOrigin)` 条目列表。在遇到非 ForStmt、`Original` 循环或 `ScopeStmt` 时停止。 - -2. **守卫检查** — 验证所有 ChunkInner 循环为 Parallel。检查最内层循环体中无已有 InCore 作用域。 - -3. **分离** — 将链分为 `outers`(ChunkOuter)和 `inners`(ChunkInner)。 - -4. **重建**(由内到外构建): - - 访问最内层循环体 - - 将 inners 包裹在循环体外(保持顺序),重新连接 iter_args - - 包裹在 `InCoreScopeStmt` 中 - - 将 outers 包裹在 InCore 外(保持顺序),重新连接 iter_args 和 yields - -5. 
**处理余数** — `ChunkRemainder` 循环:递归进入循环体。将独立的并行余数子循环包裹在 InCore 中。 - -## 自动命名缩写 - -下面示例里的变量名使用了 `base__qualifier_role_vN` 这一紧凑格式,其中 qualifier 有若干缩写: - -| 缩写 | 含义 | -| ---- | ---- | -| `co` | `chunk_outer` | -| `ci` | `chunk_inner` | -| `cr` | `chunk_rem` / 余数分块 | -| `lN` | interchange 之后的第 `N` 层循环 | - -示例: - -- `x__co_iter_v1`:交换前的外层分块 iter_arg -- `x__co_l0_iter_v1`:交换后第 0 层循环上传递的 iter_arg -- `x__co_l2_rv_v1`:从重排后第 2 层循环流出的 return var - -像 `iter`、`rv`、`idx`、`ssa` 这样的 role 不再继续缩写,以便变量用途仍然一眼可见。 - -## 示例 - -**之前**(SplitChunkedLoops 之后,全并行): - -```python -for i__co_idx_v0, (x__co_iter_v1,) in pl.range(2, init_values=(x__ssa_v0,)): # ChunkOuter - for i__ci_idx_v0, (x__ci_iter_v1,) in pl.parallel( - 4, init_values=(x__co_iter_v1,) - ): # ChunkInner - for j__co_idx_v0, (y__co_iter_v1,) in pl.range( - 3, init_values=(x__ci_iter_v1,) - ): # ChunkOuter - for j__ci_idx_v0, (y__ci_iter_v1,) in pl.parallel( - 4, init_values=(y__co_iter_v1,) - ): # ChunkInner - z = pl.add(y__ci_iter_v1, 1.0) - y__ci_rv_v1 = pl.yield_(z) - y__co_rv_v1 = pl.yield_(y__ci_rv_v1) - x__ci_rv_v1 = pl.yield_(y__co_rv_v1) - x__co_rv_v1 = pl.yield_(x__ci_rv_v1) -return x__co_rv_v1 -``` - -**之后**(InterchangeChunkLoops): - -```python -for i__co_idx_v0, (x__co_l0_iter_v1,) in pl.range( - 2, init_values=(x__ssa_v0,) -): # ChunkOuter - for j__co_idx_v0, (x__co_l1_iter_v1,) in pl.range( - 3, init_values=(x__co_l0_iter_v1,) - ): # ChunkOuter - with pl.incore(): # 插入 InCore - for i__ci_idx_v0, (x__co_l2_iter_v1,) in pl.parallel( - 4, init_values=(x__co_l1_iter_v1,) - ): # ChunkInner - for j__ci_idx_v0, (x__co_l3_iter_v1,) in pl.parallel( - 4, init_values=(x__co_l2_iter_v1,) - ): # ChunkInner - z = pl.add(x__co_l3_iter_v1, 1.0) - x__co_l3_rv_v1 = pl.yield_(z) - x__co_l2_rv_v1 = pl.yield_(x__co_l3_rv_v1) - x__co_l1_rv_v1 = pl.yield_(x__co_l2_rv_v1) - x__co_l0_rv_v1 = pl.yield_(x__co_l1_rv_v1) -return x__co_l0_rv_v1 -``` - -## 余数处理 - -对于不整除的迭代次数,余数循环会被包裹在 InCore 中: - -```python -for i_rem, (...) 
in pl.parallel(2, init_values=(...)): # ChunkRemainder - for j_out, (...) in pl.range(3, init_values=(...)): # 已应用交换 - with pl.incore(): - for j_in, (...) in pl.parallel(4, init_values=(...)): - body - with pl.incore(): # 余数已包裹 - for j_rem, (...) in pl.parallel(2, init_values=(...)): - body -``` - -## 非分块语句处理 - -当 `auto_incore` 被消费时,未被分块交换处理的语句(独立张量算子、非分块循环、未通过并行守卫检查的顺序分块循环)会被包裹在 `InCoreScopeStmt` 中,以确保它们被 `OutlineIncoreScopes` 提取到 InCore 函数中。 - -连续的非 InCore 语句会被分组到单个 `InCoreScopeStmt` 中。控制流语句(`YieldStmt`、`ReturnStmt`)和纯标量赋值(例如索引运算 `offset = ob * 32`)不会被包裹——它们留在编排作用域中。 - -**示例** — 独立算子 + 并行分块: - -```python -# 之前(在 auto_incore 内部,SplitChunkedLoops 之后) -with pl.auto_incore(): - x = pl.add(x, 1.0) # 独立算子 - for i_out in pl.range(2): # ChunkOuter(并行内层) - for i_in in pl.parallel(4): - x = pl.add(x, 2.0) - -# InterchangeChunkLoops 之后 -with pl.incore(): # 独立算子已包裹 - x = pl.add(x, 1.0) -for i_out in pl.range(2): # 已交换的分块 - with pl.incore(): - for i_in in pl.parallel(4): - x = pl.add(x, 2.0) -``` - -**示例** — 顺序分块(未通过交换守卫检查): - -```python -# 之前 -with pl.auto_incore(): - for i_out in pl.range(2): # ChunkOuter(顺序内层) - for i_in in pl.range(4): # ChunkInner,Sequential → 未通过守卫 - x = pl.add(x, 1.0) - -# 之后 — 整个链被包裹在 InCore 中 -with pl.incore(): - for i_out in pl.range(2): - for i_in in pl.range(4): - x = pl.add(x, 1.0) -``` - -## 流水线位置 - -```text -UnrollLoops → ConvertToSSA → FlattenCallExpr → SplitChunkedLoops → InterchangeChunkLoops → OutlineIncoreScopes → ... 
-``` - -## Pass 属性 - -| 属性 | 值 | -| ---- | -- | -| Required | `TypeChecked`、`SSAForm` | -| Produced | `TypeChecked`、`SSAForm` | -| Invalidated | (无) | diff --git a/docs/zh-cn/dev/passes/06-outline_incore_scopes.md b/docs/zh-cn/dev/passes/06-outline_incore_scopes.md new file mode 100644 index 000000000..b0164195e --- /dev/null +++ b/docs/zh-cn/dev/passes/06-outline_incore_scopes.md @@ -0,0 +1,217 @@ +# OutlineIncoreScopes Pass + +将 `level_ == CORE_GROUP` 的 `HierarchyScopeStmt` 区域提取为独立的 +`Function(InCore)` 定义,并把外层父函数由 `Opaque` 提升为 `Orchestration`。 + +## 概述 + +该 Pass 专门处理 `HierarchyScopeStmt` 的 `CORE_GROUP` 形式 —— 即由 +`with pl.at(level=pl.Level.CORE_GROUP):` 引入的 per-core-group 内核区域。 +对每个此类作用域,它都会提取出一个新的 `Function`,`func_type_` 为 +`FunctionType::InCore`,并将原作用域替换为对该函数的 `Call`。只要从某个 +父函数中提取出至少一个 `CORE_GROUP` 作用域,就把该父函数的 `func_type_` +由 `Opaque` 提升为 `Orchestration`。 + +本 Pass 是 [`OutlineHierarchyScopes`](05-outline_hierarchy_scopes.md) 在 +`CORE_GROUP` 方向的对应 Pass,后者只处理非 `CORE_GROUP` 层级,生成 +`Function(Opaque)` 且不修改父函数类型。 + +| 作用域 `level_` | 提取出的函数类型 | 父函数类型(Pass 后) | +| --------------- | ---------------- | --------------------- | +| `Level.CORE_GROUP` | `FunctionType::InCore` | `Opaque` 提升为 `Orchestration` | +| 其他层级 | *(本 Pass 不处理;已由 `OutlineHierarchyScopes` 提取)* | — | + +当 `CORE_GROUP` 作用域携带 `split_` 优化提示时,会把该提示复制到提取出的 +`InCore` 函数 attrs 中,供下游 Pass(特别是 +[`ExpandMixedKernel`](11-expand_mixed_kernel.md))在决定如何拆分 AIC / +AIV 核时使用。 + +**前置条件**: + +- 输入 IR 必须为 SSA 形式(需先运行 `ConvertToSSA`)。本 Pass 保留 + (产生)SSA 形式。 +- 期望 `OutlineHierarchyScopes` 已经运行过,因此当前只剩下 `CORE_GROUP` + 的 `HierarchyScopeStmt` 节点需要处理。 +- 仅处理 `Opaque` 函数(其中可能残留 `CORE_GROUP` 作用域)。已经为 + `Orchestration`、`InCore`、`AIC`、`AIV`、`Group` 的函数保持不变。 + +**使用时机**:在 [`OutlineHierarchyScopes`](05-outline_hierarchy_scopes.md) +之后、[`OutlineClusterScopes`](07-outline_cluster_scopes.md) 之前运行。 +本 Pass 完成后,`HierarchyOutlined` 属性成立:`Opaque` / `Orchestration` +函数中不再残留任何 `HierarchyScopeStmt` 节点。 + +## API + +| C++ | Python | 级别 | +| --- | 
------ | ---- | +| `pass::OutlineIncoreScopes()` | `passes.outline_incore_scopes()` | 程序级 | + +**工厂函数**: + +```cpp +Pass OutlineIncoreScopes(); +``` + +**Python 用法**: + +```python +from pypto.pypto_core import passes + +outline_pass = passes.outline_incore_scopes() +program_outlined = outline_pass(program) +``` + +## 算法 + +1. **扫描 CORE_GROUP 作用域**:在每个 `Opaque` 函数体中查找所有 `level_` + 为 `CORE_GROUP` 的 `HierarchyScopeStmt` 节点。 +2. **分析输入/输出**:复用 scope_outline_utils 辅助工具计算外部定义、内部 + 使用的变量(输入)以及内部定义、外部使用的变量(输出)。 +3. **创建 InCore 函数**:将作用域体提取为新的 `Function`: + - 参数 = 输入变量 + - 返回值 = 输出变量 + - 函数体 = 作用域体 + - `func_type_` = `InCore` + - 将 `role_` 复制到函数 attrs + - 若作用域携带 `split_` 优化提示,将其复制到函数的 `split` attr + (由 `ExpandMixedKernel` 消费) +4. **替换作用域**:将原 `HierarchyScopeStmt` 替换为对提取出 InCore 函数的 + `Call` + 绑定返回值的若干 `AssignStmt`。 +5. **父函数提升**:若父函数中至少有一个 `CORE_GROUP` 作用域被提取,则将 + 该父函数由 `Opaque` 重标记为 `Orchestration`。 +6. **加入程序**:将提取出的 InCore 函数前置到程序的函数列表中。 + +**命名规则**:`{原函数名}_core_group_{计数器}`(例如 +`main_core_group_0`)。提取出的 InCore 函数在 attrs 中使用 `_incore_` +风格的名称后缀,在打印的 IR 中便于识别。若 +`HierarchyScopeStmt.name_hint` 非空,则直接使用该 name_hint。 + +## 示例 + +### CORE_GROUP → InCore + Orchestration + +**之前**(假设 `OutlineHierarchyScopes` 已完成,非 CORE_GROUP 作用域已经 +被提取;CORE_GROUP 作用域仍内联在 `main` 中): + +```python +@pl.program +class Before: + @pl.function # Opaque + def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + y = x + 1 + + with pl.at(level=pl.Level.CORE_GROUP): + tile = pl.load(y, [0], [64]) + tile_sq = pl.mul(tile, tile) + result_tile = tile_sq + 1 + result = pl.store(result_tile, [0], x) + + z = result + 2 + return z +``` + +**之后**: + +```python +@pl.program +class After: + @pl.function(type=pl.FunctionType.Orchestration) # 已升级 + def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + y = x + 1 + result = self.main_core_group_0(y, x) # 调用提取出的 InCore 函数 + z = result + 2 + return z + + @pl.function(type=pl.FunctionType.InCore) # 提取出 + def main_core_group_0(self, y: 
pl.Tensor[[64], pl.FP32], + x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + tile = pl.load(y, [0], [64]) + tile_sq = pl.mul(tile, tile) + result_tile = tile_sq + 1 + result = pl.store(result_tile, [0], x) + return result +``` + +### 带 split 提示的 CORE_GROUP + +```python +with pl.at(level=pl.Level.CORE_GROUP, + optimizations=[pl.split(pl.SplitMode.UP_DOWN)]): + ... +``` + +提取出的 `InCore` 函数 attrs 中会携带该 `split` 提示,供后续 +`ExpandMixedKernel` 读取以决定 AIC+AIV 拆分方式。 + +### 多输出 + +```python +with pl.at(level=pl.Level.CORE_GROUP): + a_tile = pl.load(a, [0], [64]) + b_tile = pl.load(b, [0], [64]) + c_tile = pl.add(a_tile, b_tile) + out_a = pl.store(c_tile, [0], out) + out_b = pl.mul(c_tile, 2.0) +# out_a 与 out_b 都在作用域之后被使用 +x = out_a + out_b +``` + +提取后,父函数体变为: + +```python +out_a, out_b = self.main_core_group_0(a, b, out) # 多返回值 +x = out_a + out_b +``` + +## 实现 + +**头文件**:`include/pypto/ir/transforms/passes.h` + +```cpp +Pass OutlineIncoreScopes(); +``` + +**实现文件**:`src/ir/transforms/outline_incore_scopes_pass.cpp` + +- 使用公共 `scope_outline_utils` 计算输入/输出 +- 对每个 `CORE_GROUP` 作用域构造新的 `Function(InCore)` +- 将 `role_` / `split_` 元信息复制到提取函数的 attrs +- 当从某父函数中至少提取出一个 `CORE_GROUP` 作用域时,将该父函数 + 由 `Opaque` 重标记为 `Orchestration` + +**Python 绑定**:`python/bindings/modules/passes.cpp` + +```cpp +passes.def("outline_incore_scopes", &pass::OutlineIncoreScopes, + "Outline CORE_GROUP HierarchyScopeStmt regions into Function(InCore) " + "and promote the parent function to Orchestration"); +``` + +**测试**:`tests/ut/ir/transforms/test_outline_incore_scopes.py` + +- 测试 `CORE_GROUP` 作用域 → `InCore` 函数 + 父函数升级为 `Orchestration` +- 测试 `split_` 透传到提取出的 InCore 函数 +- 测试输入/输出分析 +- 测试同一父函数中多个 `CORE_GROUP` 作用域 +- 测试 SSA 保留 + +## Pass 属性 + +| 属性 | 值 | +| ---- | -- | +| 所需 | `SSAForm` | +| 产生 | `SSAForm`, `HierarchyOutlined` | +| 失效 | — | + +`HierarchyOutlined` 由本 Pass 产生(而非 +[`OutlineHierarchyScopes`](05-outline_hierarchy_scopes.md)):两次 outline +Pass 全部结束后,`Opaque`/`Orchestration` 函数中不再残留任何 
+`HierarchyScopeStmt` 节点。 + +## 流水线位置 + +```text +... → ConvertToSSA → NormalizeStmtStructure → FlattenCallExpr → +OutlineHierarchyScopes → OutlineIncoreScopes → OutlineClusterScopes → +ConvertTensorToTileOps → ... +``` diff --git a/docs/zh-cn/dev/passes/08-outline_cluster_scopes.md b/docs/zh-cn/dev/passes/07-outline_cluster_scopes.md similarity index 72% rename from docs/zh-cn/dev/passes/08-outline_cluster_scopes.md rename to docs/zh-cn/dev/passes/07-outline_cluster_scopes.md index f295828b5..0c517757e 100644 --- a/docs/zh-cn/dev/passes/08-outline_cluster_scopes.md +++ b/docs/zh-cn/dev/passes/07-outline_cluster_scopes.md @@ -11,7 +11,7 @@ - 输入 IR 必须为静态单赋值 (SSA) 形式(需先运行 ConvertToSSA) - 仅处理 Opaque 和 Orchestration 函数 -**使用时机**:在 `OutlineIncoreScopes` 之后运行,当 IR 包含需要提取的 `with pl.cluster():` 作用域或 standalone `with pl.spmd(...):` 作用域时使用。 +**使用时机**:在 `OutlineHierarchyScopes` 和 `OutlineIncoreScopes` 之后运行,当 IR 包含需要提取的 `with pl.cluster():` 作用域或 standalone `with pl.spmd(...):` 作用域时使用。Cluster 体内可能仍包含由 `OutlineIncoreScopes` 先前生成的 `Function(InCore)` 调用。 ## API @@ -42,7 +42,9 @@ program_outlined = outline_pass(program) ## 示例 -**之前**: +**之前**(假设 `OutlineIncoreScopes` 已经把内层的 +`with pl.at(level=pl.Level.CORE_GROUP): ...` 作用域提取为 `Function(InCore)` +`main_core_group_0`): ```python @pl.program @@ -50,8 +52,7 @@ class Before: @pl.function def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: with pl.cluster(): - with pl.incore(): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) + y: pl.Tensor[[64], pl.FP32] = self.main_core_group_0(x) return y ``` @@ -62,8 +63,7 @@ class Before: class After: @pl.function(type=pl.FunctionType.Group) def main_cluster_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.incore(): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) + y: pl.Tensor[[64], pl.FP32] = self.main_core_group_0(x) return y @pl.function @@ -72,7 +72,9 @@ class After: return y ``` -注意:Cluster 内部的 InCore 作用域在提取的 Group 函数中被保留。可以先运行 `OutlineIncoreScopes` 提取 
InCore 作用域再进行聚簇,也可以之后在 Group 函数内提取。 +注意:`OutlineHierarchyScopes` 与 `OutlineIncoreScopes` 均先于本 Pass 运行, +因此 Cluster 体内已经是对 `Function(InCore)` 的调用,而非内联的 +`HierarchyScopeStmt` 节点。提取出的 Group 函数会保留这些调用。 ## Standalone Spmd 示例 @@ -132,12 +134,12 @@ class After: | 产生 | SSAForm, ClusterOutlined | | 失效 | — | -## 与 OutlineIncoreScopes 的关系 +## 与 OutlineHierarchyScopes / OutlineIncoreScopes 的关系 -| 方面 | OutlineIncoreScopes | OutlineClusterScopes | -| ---- | ------------------- | -------------------- | -| 作用域类型 | `ScopeKind::InCore` | `ScopeKind::Cluster` / standalone `ScopeKind::Spmd` | -| 输出函数类型 | `FunctionType::InCore` | `FunctionType::Group` / `FunctionType::Spmd` | -| 命名模式 | `{func}_incore_{n}` | `{func}_cluster_{n}` / `{func}_spmd_{n}` | -| 提升父函数为 | Orchestration | *(不变)* | -| 处理对象 | 仅 Opaque 函数 | Opaque + Orchestration | +| 方面 | OutlineHierarchyScopes | OutlineIncoreScopes | OutlineClusterScopes | +| ---- | ---------------------- | ------------------- | -------------------- | +| 作用域类型 | `HierarchyScopeStmt`(非 CORE_GROUP) | `HierarchyScopeStmt`(CORE_GROUP) | `ClusterScopeStmt` / standalone `SpmdScopeStmt` | +| 输出函数类型 | `FunctionType::Opaque` | `FunctionType::InCore` | `FunctionType::Group` / `FunctionType::Spmd` | +| 命名模式 | `{func}_{level}_{n}` | `{func}_core_group_{n}` | `{func}_cluster_{n}` / `{func}_spmd_{n}` | +| 提升父函数为 | *(不变)* | `Orchestration` | *(不变)* | +| 处理对象 | 仅 `Opaque` 函数 | 仅 `Opaque` 函数 | `Opaque` + `Orchestration` | diff --git a/docs/zh-cn/dev/passes/07-outline_incore_scopes.md b/docs/zh-cn/dev/passes/07-outline_incore_scopes.md deleted file mode 100644 index 90d45bca8..000000000 --- a/docs/zh-cn/dev/passes/07-outline_incore_scopes.md +++ /dev/null @@ -1,173 +0,0 @@ -# OutlineIncoreScopes Pass - -将 InCore 作用域提取为独立函数。 - -## 概述 - -该 Pass 将 `InCoreScopeStmt` 节点变换为独立的 `Function(InCore)` 定义,并将原作用域替换为对提取函数的调用。 - -**前置条件**: - -- 输入 IR 必须为静态单赋值 (SSA) 形式(需先运行 ConvertToSSA);该 Pass 保持(产生)SSAForm -- 仅处理 Opaque 函数(InCore 函数保持不变) - -**使用时机**:在 ConvertToSSA 之后运行,当需要将 InCore 
计算区域提取为独立的可调用函数时使用。 - -## API - -| C++ | Python | 级别 | -| --- | ------ | ---- | -| `pass::OutlineIncoreScopes()` | `passes.outline_incore_scopes()` | 程序级 | - -**工厂函数**: - -```cpp -Pass OutlineIncoreScopes(); -``` - -**Python 用法**: - -```python -from pypto.pypto_core import passes - -outline_pass = passes.outline_incore_scopes() -program_outlined = outline_pass(program) -``` - -## 算法 - -1. **扫描 InCore 作用域**:在 Opaque 函数中查找所有 `InCoreScopeStmt` 节点 -2. **分析输入**:确定外部变量引用(在作用域外定义、在作用域内使用的变量) -3. **分析输出**:确定在作用域之后仍被使用的内部定义(在作用域内定义、在作用域外使用的变量) -4. **创建函数**:将作用域体提取为新的 `Function(scope_type=InCore)`,其中: - - 参数 = 输入变量 - - 返回值 = 输出变量 - - 函数体 = 作用域体 -5. **替换作用域**:将 `InCoreScopeStmt` 替换为: - - 带有输入参数的提取函数调用 - - 每个输出变量对应一个 AssignStmt -6. **添加到程序**:将提取的函数添加到程序的函数列表中 - -**命名规则**: - -- 默认:`{原函数名}_incore_{计数器}`(如 `main_incore_0`、`main_incore_1`) -- 用户自定义:当 `InCoreScopeStmt.name_hint` 非空时,直接使用该名称 - - `with pl.incore(name_hint="fused_add"):` → 函数名为 `fused_add` - -## 示例 - -### 基本提取 - -**之前**: - -```python -@pl.program -class Before: - @pl.function # Opaque function - def main(self, x: Tensor[[64], FP32]) -> Tensor[[64], FP32]: - y = x + 1 - - with pl.incore(): # InCore scope - tile = pl.load(y, [0], [64]) - tile_sq = pl.mul(tile, tile) - result_tile = tile_sq + 1 - result = pl.store(result_tile, [0], x) - - z = result + 2 - return z -``` - -**之后**: - -```python -@pl.program -class After: - @pl.function # Opaque function - def main(self, x: Tensor[[64], FP32]) -> Tensor[[64], FP32]: - y = x + 1 - - # Scope replaced with call + assignments - result = self.main_incore_0(y, x) # Call outlined function - - z = result + 2 - return z - - @pl.function(scope_type=InCore) # Outlined InCore function - def main_incore_0(self, y: Tensor[[64], FP32], x: Tensor[[64], FP32]) -> Tensor[[64], FP32]: - # Scope body moved here - tile = pl.load(y, [0], [64]) - tile_sq = pl.mul(tile, tile) - result_tile = tile_sq + 1 - result = pl.store(result_tile, [0], x) - return result -``` - -### 多输出 - -**之前**: - 
-```python -with pl.incore(): - a_tile = pl.load(a, [0], [64]) - b_tile = pl.load(b, [0], [64]) - c_tile = pl.add(a_tile, b_tile) - out_a = pl.store(c_tile, [0], out) - out_b = pl.mul(c_tile, 2.0) -# Both out_a and out_b used after scope -x = out_a + out_b -``` - -**之后**: - -```python -out_a, out_b = self.main_incore_0(a, b, out) # Multiple outputs -x = out_a + out_b - -# Outlined function: -def main_incore_0(self, a, b, out): - a_tile = pl.load(a, [0], [64]) - b_tile = pl.load(b, [0], [64]) - c_tile = pl.add(a_tile, b_tile) - out_a = pl.store(c_tile, [0], out) - out_b = pl.mul(c_tile, 2.0) - return (out_a, out_b) -``` - -## 实现 - -**头文件**:`include/pypto/ir/transforms/passes.h` - -```cpp -Pass OutlineIncoreScopes(); -``` - -**实现文件**:`src/ir/transforms/outline_incore_scopes.cpp` - -- 使用 SSA 分析确定输入/输出 -- 创建带有 InCore 作用域类型的新 Function 节点 -- 将 InCoreScopeStmt 替换为 Call + AssignStmt -- 管理函数命名和计数器 - -**Python 绑定**:`python/bindings/modules/passes.cpp` - -```cpp -passes.def("outline_incore_scopes", &pass::OutlineIncoreScopes, "Outline InCore scopes"); -``` - -**测试**:`tests/ut/ir/transforms/test_outline_incore_scopes.py` - -- 测试基本作用域提取 -- 测试输入/输出分析 -- 测试同一函数中的多个作用域 -- 测试嵌套作用域 -- 测试 SSA 保持 - -## 前置条件 - -**需要 SSA 形式**:该 Pass 依赖 SSA 属性 (Property): - -- 单赋值确保清晰的输入/输出分析 -- 无变量遮蔽简化了作用域分析 -- 控制流中的 YieldStmt 被正确处理 - -如果 IR 不是 SSA 形式,**请先运行 ConvertToSSA**。 diff --git a/docs/zh-cn/dev/passes/09-convert_tensor_to_tile_ops.md b/docs/zh-cn/dev/passes/08-convert_tensor_to_tile_ops.md similarity index 90% rename from docs/zh-cn/dev/passes/09-convert_tensor_to_tile_ops.md rename to docs/zh-cn/dev/passes/08-convert_tensor_to_tile_ops.md index 22097f490..02c114d8a 100644 --- a/docs/zh-cn/dev/passes/09-convert_tensor_to_tile_ops.md +++ b/docs/zh-cn/dev/passes/08-convert_tensor_to_tile_ops.md @@ -4,14 +4,14 @@ ## 概述 -`OutlineIncoreScopes` 将 InCore 作用域提取为独立函数后,这些函数仍使用 `TensorType` 变量和 `tensor.*` 操作。本 pass 将其降级为直接映射到 PTO-ISA 指令的 `TileType` 变量和 `tile.*` 操作。 +`OutlineHierarchyScopes` 和 
`OutlineIncoreScopes` 将 `HierarchyScopeStmt` 区域提取为独立函数(其中 `OutlineIncoreScopes` 对 `CORE_GROUP` 作用域产生 `Function(InCore)`)后,这些 InCore 函数仍使用 `TensorType` 变量和 `tensor.*` 操作。本 pass 将其降级为直接映射到 PTO-ISA 指令的 `TileType` 变量和 `tile.*` 操作。 本 pass 还会更新编排/不透明函数中的调用点:为 InCore 函数新增的每个输出参数,在调用点插入 `tensor.create`。 **前置条件**: - 输入 IR 必须为 SSA 形式 -- InCore 作用域必须已提取(需先运行 `OutlineIncoreScopes`) +- Hierarchy 作用域必须已提取为独立函数(需先运行 `OutlineHierarchyScopes` 和 `OutlineIncoreScopes`) - 语句结构必须已规范化 **使用时机**:在 `OutlineClusterScopes` 之后、`OptimizeOrchTensors` 之前运行。 @@ -119,7 +119,7 @@ class After: | 属性 | 值 | | ---- | -- | -| Required | SSAForm, SplitIncoreOrch, NormalizedStmtStructure | +| Required | SSAForm, HierarchyOutlined, NormalizedStmtStructure | | Produced | SSAForm, IncoreTileOps, NormalizedStmtStructure | | Invalidated | — | diff --git a/docs/zh-cn/dev/passes/10-optimize_orch_tensors.md b/docs/zh-cn/dev/passes/09-optimize_orch_tensors.md similarity index 98% rename from docs/zh-cn/dev/passes/10-optimize_orch_tensors.md rename to docs/zh-cn/dev/passes/09-optimize_orch_tensors.md index aa0ac9438..e54867d42 100644 --- a/docs/zh-cn/dev/passes/10-optimize_orch_tensors.md +++ b/docs/zh-cn/dev/passes/09-optimize_orch_tensors.md @@ -132,8 +132,8 @@ class After: | 属性 | 值 | | ---- | -- | -| Required | SplitIncoreOrch, IncoreTileOps | -| Produced | SplitIncoreOrch, IncoreTileOps | +| Required | HierarchyOutlined, IncoreTileOps | +| Produced | HierarchyOutlined, IncoreTileOps | | Invalidated | — | ## 关键组件 diff --git a/docs/zh-cn/dev/passes/11-flatten_tile_nd_to_2d.md b/docs/zh-cn/dev/passes/10-flatten_tile_nd_to_2d.md similarity index 100% rename from docs/zh-cn/dev/passes/11-flatten_tile_nd_to_2d.md rename to docs/zh-cn/dev/passes/10-flatten_tile_nd_to_2d.md diff --git a/docs/zh-cn/dev/passes/14-expand_mixed_kernel.md b/docs/zh-cn/dev/passes/11-expand_mixed_kernel.md similarity index 96% rename from docs/zh-cn/dev/passes/14-expand_mixed_kernel.md rename to docs/zh-cn/dev/passes/11-expand_mixed_kernel.md 
index 9e04fd2da..29f3d93c0 100644 --- a/docs/zh-cn/dev/passes/14-expand_mixed_kernel.md +++ b/docs/zh-cn/dev/passes/11-expand_mixed_kernel.md @@ -4,7 +4,7 @@ ## 概述 -在 `OutlineIncoreScopes` 和 `ConvertTensorToTileOps` 之后,InCore 函数可能同时包含 Cube 操作(`tile.matmul`、`tile.gemv` 等)和 Vector 操作(`tile.add`、`tile.exp` 等)。部分操作如 `tile.load`、`tile.store`、`tile.move`、`tile.reshape` 根据其 tile 操作数的 MemorySpace 被分类为 Cube 或 Vector。包含两侧操作的函数是**混合 InCore 函数**。硬件要求 Cube 和 Vector 操作在不同的核心类型上运行,因此该 Pass 将它们拆分为: +在 `OutlineHierarchyScopes`、`OutlineIncoreScopes` 和 `ConvertTensorToTileOps` 之后,InCore 函数可能同时包含 Cube 操作(`tile.matmul`、`tile.gemv` 等)和 Vector 操作(`tile.add`、`tile.exp` 等)。部分操作如 `tile.load`、`tile.store`、`tile.move`、`tile.reshape` 根据其 tile 操作数的 MemorySpace 被分类为 Cube 或 Vector。包含两侧操作的函数是**混合 InCore 函数**。硬件要求 Cube 和 Vector 操作在不同的核心类型上运行,因此该 Pass 将它们拆分为: - **AIC 函数**(`FunctionType::AIC`)— 仅包含 Cube + 共享操作 - **AIV 函数**(`FunctionType::AIV`)— 仅包含 Vector + 共享操作 @@ -76,7 +76,7 @@ Ascend910B(a2a3)——跨核传输经过 GM → Mat,Mat 仅支持 NZ 布 **前置条件**: - 输入 IR 必须具有 tile 操作(需先运行 `ConvertTensorToTileOps`) -- 输入 IR 必须已提取 InCore 作用域(需先运行 `OutlineIncoreScopes`) +- 输入 IR 必须已提取 Hierarchy 作用域为独立函数(需先运行 `OutlineHierarchyScopes` 和 `OutlineIncoreScopes`) - Tile 操作必须已展平为 2D(需先运行 `FlattenTileNdTo2D`) - Tile 内存空间必须已推断(需先运行 `InferTileMemorySpace`) - 跨核 Fractal TileView 分配在 Ascend950 和 Ascend910B 后端均受支持 @@ -289,7 +289,7 @@ class After: | 属性 | 值 | | ---- | -- | -| 所需 | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D, TileMemoryInferred | +| 所需 | SSAForm, IncoreTileOps, HierarchyOutlined, TileOps2D, TileMemoryInferred | | 产生 | SSAForm, MixedKernelExpanded | | 失效 | — | diff --git a/docs/zh-cn/dev/passes/15-init_memref.md b/docs/zh-cn/dev/passes/12-init_memref.md similarity index 98% rename from docs/zh-cn/dev/passes/15-init_memref.md rename to docs/zh-cn/dev/passes/12-init_memref.md index 954def992..db5c04692 100644 --- a/docs/zh-cn/dev/passes/15-init_memref.md +++ b/docs/zh-cn/dev/passes/12-init_memref.md @@ -12,7 +12,7 @@ 内存空间从 `TileType::memory_space_` 读取(由 
InferTileMemorySpace 设置)。无 `memory_space` 的变量默认为 DDR。 -**需要**:SSAForm、SplitIncoreOrch、IncoreTileOps、TileOps2D、TileMemoryInferred。 +**需要**:SSAForm、HierarchyOutlined、IncoreTileOps、TileOps2D、TileMemoryInferred。 **产生**:HasMemRefs、NormalizedStmtStructure。 diff --git a/docs/zh-cn/dev/passes/16-memory_reuse.md b/docs/zh-cn/dev/passes/13-memory_reuse.md similarity index 100% rename from docs/zh-cn/dev/passes/16-memory_reuse.md rename to docs/zh-cn/dev/passes/13-memory_reuse.md diff --git a/docs/zh-cn/dev/passes/17-allocate_memory_addr.md b/docs/zh-cn/dev/passes/14-allocate_memory_addr.md similarity index 100% rename from docs/zh-cn/dev/passes/17-allocate_memory_addr.md rename to docs/zh-cn/dev/passes/14-allocate_memory_addr.md diff --git a/docs/zh-cn/dev/passes/20-partial_unroll_tile_loops.md b/docs/zh-cn/dev/passes/15-partial_unroll_tile_loops.md similarity index 97% rename from docs/zh-cn/dev/passes/20-partial_unroll_tile_loops.md rename to docs/zh-cn/dev/passes/15-partial_unroll_tile_loops.md index bb62dcfe9..8ed13984d 100644 --- a/docs/zh-cn/dev/passes/20-partial_unroll_tile_loops.md +++ b/docs/zh-cn/dev/passes/15-partial_unroll_tile_loops.md @@ -8,7 +8,7 @@ `PartialUnrollTileLoops` 提供更精细的开关:在 tile 层级把循环体复制 `F` 份(典型值 2–4),保留外层 `N/F` 次顺序迭代。每个副本获得独立的定义变量(保持 SSA),各自操作独立的 tile,下游 `MemoryReuse` 无法将其合并。 -**前置条件**: SSAForm、SplitIncoreOrch、IncoreTileOps、TileOps2D、TileMemoryInferred、NormalizedStmtStructure。 +**前置条件**: SSAForm、HierarchyOutlined、IncoreTileOps、TileOps2D、TileMemoryInferred、NormalizedStmtStructure。 **流水线位置**: 位于 `NormalizeReturnOrder` 之后、`InitMemRef` 之前(slot 20.5)。此时 tile 结构决策已完成;同时早于 `InitMemRef`/`MemoryReuse`,使其看到每个副本独立的 tile 变量。 @@ -157,6 +157,6 @@ else: ## 相关 -- [`ReorderUnrolledIO`](21-reorder_unrolled_io.md) —— 消费 `unroll_replicated` 标记 +- [`ReorderUnrolledIO`](16-reorder_unrolled_io.md) —— 消费 `unroll_replicated` 标记 - [`UnrollLoops`](01-unroll_loops.md) —— slot #1 的全展开 Pass,仍是 `pl.unroll(N)` 的主要降级路径 - RFC #1025 —— 设计文档 diff --git 
a/docs/zh-cn/dev/passes/21-reorder_unrolled_io.md b/docs/zh-cn/dev/passes/16-reorder_unrolled_io.md similarity index 95% rename from docs/zh-cn/dev/passes/21-reorder_unrolled_io.md rename to docs/zh-cn/dev/passes/16-reorder_unrolled_io.md index 35347db47..cb9053a47 100644 --- a/docs/zh-cn/dev/passes/21-reorder_unrolled_io.md +++ b/docs/zh-cn/dev/passes/16-reorder_unrolled_io.md @@ -14,7 +14,7 @@ 只要数据流允许,结果即为 `[loads…, compute…, stores…]`。各克隆的输入 tile 在顶部同时活跃,输出 tile 在底部同时活跃 —— `MemoryReuse` 无法合并它们,每个克隆保留独立的 MemRef,从而 ping-pong 缓冲成为可能。 -**前置条件**: SSAForm、SplitIncoreOrch、IncoreTileOps、TileOps2D、TileMemoryInferred、NormalizedStmtStructure。 +**前置条件**: SSAForm、HierarchyOutlined、IncoreTileOps、TileOps2D、TileMemoryInferred、NormalizedStmtStructure。 **流水线位置**: 位于 `PartialUnrollTileLoops` 之后、`InitMemRef` 之前(slot 20.6)。在 `InitMemRef` 之前运行可保留 SSAForm,依赖分析正常工作。 @@ -112,7 +112,7 @@ for i in pl.range(0, 8, 4, attrs={"unroll_replicated": 4}): ## 相关 -- [`PartialUnrollTileLoops`](20-partial_unroll_tile_loops.md) —— 生成本 Pass 消费的 `unroll_replicated` 标记 -- [`MemoryReuse`](16-memory_reuse.md) —— 在本 Pass 之后运行;受益于同时活跃的 tile +- [`PartialUnrollTileLoops`](15-partial_unroll_tile_loops.md) —— 生成本 Pass 消费的 `unroll_replicated` 标记 +- [`MemoryReuse`](13-memory_reuse.md) —— 在本 Pass 之后运行;受益于同时活跃的 tile - RFC #1025 —— 设计文档 - RFC #1026 / PR #1029 —— InOut-use 规约 + 依赖分析工具 diff --git a/docs/zh-cn/dev/passes/99-verifier.md b/docs/zh-cn/dev/passes/99-verifier.md index dfe280cc9..b183b25ae 100644 --- a/docs/zh-cn/dev/passes/99-verifier.md +++ b/docs/zh-cn/dev/passes/99-verifier.md @@ -15,7 +15,7 @@ - **可插拔规则系统**:可通过自定义验证规则进行扩展 - **基于属性的验证**:选择性属性集——精确验证所需内容 -- **结构性属性 (Structural Properties)**:TypeChecked、BreakContinueValid、NoRedundantBlocks、UseAfterDef、OutParamNotShadowed、NoNestedInCore 和 InOutUseValid 在流水线启动时由 `PassPipeline` 验证,并由 `VerificationInstrument` 在每个 Pass 执行前后验证 +- **结构性属性 (Structural Properties)**:TypeChecked、BreakContinueValid、NoRedundantBlocks、UseAfterDef、OutParamNotShadowed 和 InOutUseValid 
在流水线启动时由 `PassPipeline` 验证,并由 `VerificationInstrument` 在每个 Pass 执行前后验证 - **双重验证模式**:收集诊断信息或在首个错误时抛出异常 - **Pass 集成**:可作为优化流水线中的 Pass 使用 - **全面的诊断信息**:收集所有问题及源码位置 @@ -26,10 +26,10 @@ | 类别 | 示例 | 行为 | | ---- | ---- | ---- | -| **结构性** | TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore, InOutUseValid | 始终为真。在流水线启动时验证,并由 `VerificationInstrument` 在每个 Pass 执行前后验证。不在 PassProperties 中声明。 | +| **结构性** | TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, InOutUseValid | 始终为真。在流水线启动时验证,并由 `VerificationInstrument` 在每个 Pass 执行前后验证。不在 PassProperties 中声明。 | | **流水线** | SSAForm, NoNestedCalls, HasMemRefs, ... | 由 Pass 产生/失效。按 Pass 声明的契约验证。 | -`GetStructuralProperties()` 返回 `{TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore, InOutUseValid}`。这些在 `PassPipeline::Run()` 中**于流水线启动时验证**,并由 `VerificationInstrument` **在每个 Pass 执行前后验证**。由于没有 Pass 在 `required`/`produced`/`invalidated` 中声明它们,`VerificationInstrument` 将它们与 Pass 声明的属性合并,确保没有 Pass 破坏这些基本不变量。 +`GetStructuralProperties()` 返回 `{TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, InOutUseValid}`。这些在 `PassPipeline::Run()` 中**于流水线启动时验证**,并由 `VerificationInstrument` **在每个 Pass 执行前后验证**。由于没有 Pass 在 `required`/`produced`/`invalidated` 中声明它们,`VerificationInstrument` 将它们与 Pass 声明的属性合并,确保没有 Pass 破坏这些基本不变量。 ### 验证规则系统 @@ -68,12 +68,11 @@ | **UseAfterDefCheck** | UseAfterDef | 每个 Var 使用均由定义支配(参数、AssignStmt、循环变量、iter_arg、return_var) | | **NormalizedStmtStructure** | NormalizedStmtStructure | 展平嵌套 `SeqStmts` 并解包单子节点 `SeqStmts` | | **NoRedundantBlocks** | NoRedundantBlocks | 无单子节点或嵌套的 `SeqStmts` | -| **SplitIncoreOrch** | SplitIncoreOrch | Opaque 函数中不残留 `InCoreScopeStmt` 节点 | +| **HierarchyOutlined** | HierarchyOutlined | `Opaque`/`Orchestration` 函数中不残留 `HierarchyScopeStmt` 节点 | | **IncoreTileOps** | IncoreTileOps | InCore 函数使用 tile 操作(无张量级操作残留) | | **HasMemRefs** | HasMemRefs 
| 所有 TileType 变量已初始化 MemRef | | **AllocatedMemoryAddr** | AllocatedMemoryAddr | 所有 MemRef 在缓冲区限制内具有有效地址 | | **OutParamNotShadowed** | OutParamNotShadowed | Out/InOut 参数未被张量创建操作重新赋值 | -| **NoNestedInCore** | NoNestedInCore | 无嵌套 InCore 作用域(`InCoreScopeStmt` 内含 `InCoreScopeStmt`) | | **InOutUseValid** | InOutUseValid | 作为 InOut/Out 传入用户函数调用的变量,在调用之后不得再被读取(RFC #1026)。Group 类型函数体目前跳过,待后续完善。 | ### SSAVerify @@ -161,8 +160,8 @@ | 函数 | 返回值 | 描述 | | ---- | ------ | ---- | -| `GetStructuralProperties()` | `{TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore, InOutUseValid}` | 在流水线启动时及每个 Pass 执行前后验证的不变量 | -| `GetDefaultVerifyProperties()` | `{SSAForm, TypeChecked, NoNestedCalls, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore}` | `run_verifier()` 的默认属性集 | +| `GetStructuralProperties()` | `{TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, InOutUseValid}` | 在流水线启动时及每个 Pass 执行前后验证的不变量 | +| `GetDefaultVerifyProperties()` | `{SSAForm, TypeChecked, NoNestedCalls, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed}` | `run_verifier()` 的默认属性集 | | `GetVerifiedProperties()` | `{SSAForm, TypeChecked, AllocatedMemoryAddr, BreakContinueValid, NoRedundantBlocks, InOutUseValid}` | `PassPipeline` 自动验证的轻量级属性集 | ### RunVerifier Pass 工厂 diff --git a/docs/zh-cn/user/01-language_guide.md b/docs/zh-cn/user/01-language_guide.md index e8c93a37b..2de7e516c 100644 --- a/docs/zh-cn/user/01-language_guide.md +++ b/docs/zh-cn/user/01-language_guide.md @@ -410,50 +410,22 @@ class Model: 将代码区域标记为 InCore 执行,无需创建单独的函数: ```python -# 推荐用法(新 API): with pl.at(level=pl.Level.CORE_GROUP): y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - -# 已弃用(请改用 pl.at): -with pl.incore(): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) ``` -如需编译器驱动的 chunked 循环 outline(AutoInCore),在 `optimizations` 列表中传入 -`pl.auto_chunk`: +`OutlineIncoreScopes` 之后会把该区域提取为 `Function(InCore)`,并把父 +`Opaque` 
函数升级为 `Orchestration`。(非 CORE_GROUP 的 `pl.at(level=...)` +区域则由先行的 `OutlineHierarchyScopes` 提取为 `Function(Opaque)`,不会 +提升父函数类型。) -```python -# 推荐用法(新 API): -with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk]): - for i in pl.parallel(0, 8, 1, chunk=4): - x = pl.add(x, x) - -# 已弃用(仍可用,会触发 DeprecationWarning): -with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - ... - -with pl.auto_incore(): - ... -``` - -如需为 `ExpandMixedKernel` Pass 指定跨核 split 模式,使用 `pl.split(...)` —— 它与 -`pl.auto_chunk` 互相独立,可任意组合: +如需为 `ExpandMixedKernel` Pass 指定跨核 split 模式,在 `optimizations` +列表中传入 `pl.split(...)`: ```python -# 普通 InCore + split 提示: with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.split(pl.SplitMode.UP_DOWN)]): y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - -# AutoInCore + split 提示(独立条目,自由组合): -with pl.at(level=pl.Level.CORE_GROUP, - optimizations=[pl.auto_chunk, pl.split(pl.SplitMode.UP_DOWN)]): - for i in pl.parallel(0, 8, 1, chunk=4): - x = pl.add(x, x) - -# 已弃用的单关键字形式(仍可用,会触发 DeprecationWarning): -with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - ... ``` ## 内存与数据搬运 @@ -541,22 +513,28 @@ output_dir = ir.compile( 1. **UnrollLoops** —— 展开循环迭代 2. **CtrlFlowTransform** —— 将控制流改写为结构化 IR 3. **ConvertToSSA** —— 转换为静态单赋值形式 -4. **FlattenCallExpr** —— 展平嵌套函数调用 -5. **SplitChunkedLoops** —— 将分块循环拆分为独立循环 -6. **InterchangeChunkLoops** —— 交换分块循环顺序 -7. **OutlineHierarchyScopes** —— 提取 hierarchy 作用域 -8. **OutlineIncoreScopes** —— 将 InCore 作用域提取为独立函数 -9. **OutlineClusterScopes** —— 提取 cluster 作用域 -10. **ConvertTensorToTileOps** —— 将张量操作转换为 tile 操作 +4. **NormalizeStmtStructure** —— 展平/解包冗余的 `SeqStmts` +5. **FlattenCallExpr** —— 展平嵌套函数调用 +6. **OutlineHierarchyScopes** —— 将非 CORE_GROUP 的 `HierarchyScopeStmt` 区域提取为 `Function(Opaque)` +7. **OutlineIncoreScopes** —— 将 CORE_GROUP 的 `HierarchyScopeStmt` 区域提取为 `Function(InCore)`,并把父函数升级为 `Orchestration` +8. **OutlineClusterScopes** —— 将 cluster 作用域提取为 Group 函数 +9. 
**ConvertTensorToTileOps** —— 将张量操作转换为 tile 操作 +10. **OptimizeOrchTensors** —— 优化编排层张量操作 11. **FlattenTileNdTo2D** —— 将 ND tile 操作规范化为 2D 12. **InferTileMemorySpace** —— 推断 tile 内存空间 13. **ResolveTransposeLayout** —— 修复转置布局处理 14. **ResolveBackendOpLayouts** —— 修复 backend 受限的 tile 布局 15. **ExpandMixedKernel** —— 在需要时拆分 mixed kernel -16. **InitMemRef** —— 分配内存空间并插入缓冲区分配 -17. **MemoryReuse** —— 共享生命周期不重叠的缓冲区 -18. **LegalizePTOBufferReuse** —— 规范化 PTO 缓冲区复用模式 -19. **AllocateMemoryAddr** —— 分配具体内存地址 +16. **SplitVectorKernel** —— 在需要时拆分 vector kernel +17. **NormalizeReturnOrder** —— 按 Out/InOut 参数顺序重排返回值 +18. **PartialUnrollTileLoops** —— 在 tile 层部分展开循环 +19. **ReorderUnrolledIO** —— 将展开副本的 load/store 分组 +20. **InitMemRef** —— 分配内存空间并插入缓冲区分配 +21. **MemoryReuse** —— 共享生命周期不重叠的缓冲区 +22. **LegalizePTOBufferReuse** —— 规范化 PTO 缓冲区复用模式 +23. **AllocateMemoryAddr** —— 分配具体内存地址 +24. **FuseCreateAssembleToSlice** —— 融合 create + assemble 操作 +25. **Simplify** —— 最终简化 Pass ### 调试 diff --git a/docs/zh-cn/user/02-operation_reference.md b/docs/zh-cn/user/02-operation_reference.md index 5466e333d..dc9296348 100644 --- a/docs/zh-cn/user/02-operation_reference.md +++ b/docs/zh-cn/user/02-operation_reference.md @@ -210,6 +210,8 @@ | `yield_` | `(*values: Any) -> Any \| tuple[Any, ...]` | 从 for/if 作用域 yield 值 | | `cond` | `(condition: bool \| Scalar) -> None` | 设置 while 循环条件(必须是第一条语句) | | `const` | `(value: int \| float, dtype: DataType) -> int \| float` | 类型化常量 | -| `incore` | `() -> IncoreContext` | InCore 作用域的上下文管理器 | +| `at` | `(*, level: Level, role: Role \| None = None, optimizations: Sequence[Optimization] \| None = None) -> AtContext` | 层级作用域的上下文管理器;`level=Level.CORE_GROUP` 即为 InCore 形式 | +| `cluster` | `() -> ClusterContext` | cluster(AIC+AIV)作用域的上下文管理器 | +| `spmd` | `(*, core_num: int \| Scalar, sync_start: bool = False) -> SpmdContext` | standalone SPMD 启动作用域的上下文管理器 | | `dynamic` | `(name: str) -> DynVar` | 创建动态维度变量 | | `create_tensor` | `(shape: Sequence[IntLike], dtype: DataType) 
-> Tensor` | 创建张量(从 `pl.tensor` 提升) | diff --git a/include/pypto/ir/core.h b/include/pypto/ir/core.h index a247f3517..8f0f4303e 100644 --- a/include/pypto/ir/core.h +++ b/include/pypto/ir/core.h @@ -89,9 +89,7 @@ enum class ObjectKind { ReturnStmt, ForStmt, WhileStmt, - // Scope statement kinds (split from former ScopeStmt — see issue #1047) - InCoreScopeStmt, - AutoInCoreScopeStmt, + // Scope statement kinds (typed hierarchy — see issue #1047) ClusterScopeStmt, HierarchyScopeStmt, SpmdScopeStmt, diff --git a/include/pypto/ir/kind_traits.h b/include/pypto/ir/kind_traits.h index cfd7adcd1..de1961233 100644 --- a/include/pypto/ir/kind_traits.h +++ b/include/pypto/ir/kind_traits.h @@ -88,8 +88,6 @@ DEFINE_KIND_TRAIT(YieldStmt, ObjectKind::YieldStmt) DEFINE_KIND_TRAIT(ReturnStmt, ObjectKind::ReturnStmt) DEFINE_KIND_TRAIT(ForStmt, ObjectKind::ForStmt) DEFINE_KIND_TRAIT(WhileStmt, ObjectKind::WhileStmt) -DEFINE_KIND_TRAIT(InCoreScopeStmt, ObjectKind::InCoreScopeStmt) -DEFINE_KIND_TRAIT(AutoInCoreScopeStmt, ObjectKind::AutoInCoreScopeStmt) DEFINE_KIND_TRAIT(ClusterScopeStmt, ObjectKind::ClusterScopeStmt) DEFINE_KIND_TRAIT(HierarchyScopeStmt, ObjectKind::HierarchyScopeStmt) DEFINE_KIND_TRAIT(SpmdScopeStmt, ObjectKind::SpmdScopeStmt) @@ -127,19 +125,17 @@ struct KindTrait { static constexpr ObjectKind kinds[] = {ObjectKind::AssignStmt, ObjectKind::IfStmt, ObjectKind::YieldStmt, ObjectKind::ReturnStmt, ObjectKind::ForStmt, ObjectKind::WhileStmt, - ObjectKind::InCoreScopeStmt, ObjectKind::AutoInCoreScopeStmt, ObjectKind::ClusterScopeStmt, ObjectKind::HierarchyScopeStmt, ObjectKind::SpmdScopeStmt, ObjectKind::SeqStmts, ObjectKind::EvalStmt, ObjectKind::BreakStmt, ObjectKind::ContinueStmt}; - static constexpr size_t count = 15; + static constexpr size_t count = 13; }; -// ScopeStmt base class - matches any scope kind (5 derived classes) +// ScopeStmt base class - matches any scope kind (3 derived classes) template <> struct KindTrait { - static constexpr ObjectKind kinds[] = 
{ObjectKind::InCoreScopeStmt, ObjectKind::AutoInCoreScopeStmt, - ObjectKind::ClusterScopeStmt, ObjectKind::HierarchyScopeStmt, + static constexpr ObjectKind kinds[] = {ObjectKind::ClusterScopeStmt, ObjectKind::HierarchyScopeStmt, ObjectKind::SpmdScopeStmt}; static constexpr size_t count = sizeof(kinds) / sizeof(ObjectKind); }; diff --git a/include/pypto/ir/stmt.h b/include/pypto/ir/stmt.h index 107834763..85073e3ee 100644 --- a/include/pypto/ir/stmt.h +++ b/include/pypto/ir/stmt.h @@ -107,13 +107,15 @@ struct ChunkConfig { /** * @brief Loop origin classification for tracking how a loop was generated * - * Used by SplitChunkedLoops to tag each generated loop with its origin. + * The Chunk* values were originally produced by the deleted SplitChunkedLoops + * pass; they remain bound for user-visible attrs but no built-in pass currently + * emits them. */ enum class LoopOrigin : uint8_t { Original = 0, ///< Regular loop (default) - ChunkOuter = 1, ///< Outer loop from chunk splitting - ChunkInner = 2, ///< Inner loop from chunk splitting - ChunkRemainder = 3 ///< Remainder loop from chunk splitting + ChunkOuter = 1, ///< Outer loop from chunk splitting (no producer pass — user attr only) + ChunkInner = 2, ///< Inner loop from chunk splitting (no producer pass — user attr only) + ChunkRemainder = 3 ///< Remainder loop from chunk splitting (no producer pass — user attr only) }; /** @@ -156,11 +158,9 @@ inline LoopOrigin StringToLoopOrigin(const std::string& str) { * @brief Distinguishes different scope kinds */ enum class ScopeKind : uint8_t { - InCore = 0, ///< InCore scope for AICore sub-graphs - AutoInCore = 1, ///< AutoInCore scope for automatic chunking - Cluster = 2, ///< Cluster scope for co-scheduled AIC + AIV groups - Hierarchy = 3, ///< Distributed hierarchy scope (uses level_/role_ on ScopeStmt) - Spmd = 4 ///< SPMD dispatch scope (core_num/sync_start on ScopeStmt) + Cluster = 0, ///< Cluster scope for co-scheduled AIC + AIV groups + Hierarchy = 1, ///< 
Distributed hierarchy scope (uses level_/role_/split_ on ScopeStmt) + Spmd = 2 ///< SPMD dispatch scope (core_num/sync_start on ScopeStmt) }; /** @@ -250,14 +250,10 @@ inline ForKind StringToForKind(const std::string& str) { /** * @brief Convert ScopeKind to string * @param kind The scope kind - * @return String representation ("InCore", "AutoInCore", "Cluster", "Hierarchy", or "Spmd") + * @return String representation ("Cluster", "Hierarchy", or "Spmd") */ inline std::string ScopeKindToString(ScopeKind kind) { switch (kind) { - case ScopeKind::InCore: - return "InCore"; - case ScopeKind::AutoInCore: - return "AutoInCore"; case ScopeKind::Cluster: return "Cluster"; case ScopeKind::Hierarchy: @@ -275,11 +271,7 @@ inline std::string ScopeKindToString(ScopeKind kind) { * @throws pypto::TypeError if string is not recognized */ inline ScopeKind StringToScopeKind(const std::string& str) { - if (str == "InCore") { - return ScopeKind::InCore; - } else if (str == "AutoInCore") { - return ScopeKind::AutoInCore; - } else if (str == "Cluster") { + if (str == "Cluster") { return ScopeKind::Cluster; } else if (str == "Hierarchy") { return ScopeKind::Hierarchy; @@ -702,32 +694,33 @@ using WhileStmtPtr = std::shared_ptr; * Represents a scoped region of code with a specific execution context. * This is NOT a control flow node — it executes its body exactly once, linearly. 
* - * **Class hierarchy** (issue #1047): + * **Class hierarchy:** * - `ScopeStmt` (abstract): common fields `name_hint_`, `body_` - * - `InCoreScopeStmt`: optional `split_` - * - `AutoInCoreScopeStmt`: optional `split_` * - `ClusterScopeStmt`: no extra fields - * - `HierarchyScopeStmt`: required `level_`, optional `role_` + * - `HierarchyScopeStmt`: required `level_`, optional `role_`, optional `split_` + * (split_ is only valid at Level::CORE_GROUP) * - `SpmdScopeStmt`: required `core_num_`, `sync_start_` (default false) * * **Syntax:** - * with pl.incore(): # InCore scope -> InCoreScopeStmt + * with pl.cluster(): # -> ClusterScopeStmt * body - * with pl.cluster(): # Cluster scope -> ClusterScopeStmt + * with pl.at(level=pl.Level.CORE_GROUP): # -> HierarchyScopeStmt * body * with pl.at(level=pl.Level.HOST, role=pl.Role.Worker): # -> HierarchyScopeStmt * body - * with pl.spmd(core_num=8): # -> SpmdScopeStmt + * with pl.spmd(core_num=8): # -> SpmdScopeStmt * body * * **Semantics:** - * - Marks a region of code as belonging to a specific scope (e.g., InCore, Cluster) - * - Executes body exactly once (no iteration, no branching) - * - Variables flow through transparently (no iter_args/return_vars needed) - * - SSA conversion treats it as transparent (just visits body) - * - OutlineIncoreScopes extracts InCore scopes into InCore functions - * - OutlineClusterScopes extracts Cluster scopes into Group functions - * - Hierarchy scopes are outlined into level-/role-annotated functions + * - Marks a region of code as belonging to a specific scope. + * - Executes body exactly once (no iteration, no branching). + * - Variables flow through transparently (no iter_args/return_vars needed). + * - SSA conversion treats it as transparent (just visits body). + * - OutlineHierarchyScopes extracts Hierarchy scopes into level-/role-annotated + * functions. 
For Level::CORE_GROUP, the outlined function has + * FunctionType::InCore and the parent function is promoted to + * FunctionType::Orchestration. + * - OutlineClusterScopes extracts Cluster scopes into Group functions. */ class ScopeStmt : public Stmt { public: @@ -754,60 +747,6 @@ class ScopeStmt : public Stmt { using ScopeStmtPtr = std::shared_ptr; -/** - * @brief InCore scope: AICore sub-graph region. - * - * Carries an optional `split` for cross-core transfer mode. - */ -class InCoreScopeStmt : public ScopeStmt { - public: - InCoreScopeStmt(std::optional split, std::string name_hint, StmtPtr body, Span span, - std::vector leading_comments = {}) - : ScopeStmt(std::move(name_hint), std::move(body), std::move(span), std::move(leading_comments)), - split_(split) {} - - [[nodiscard]] ObjectKind GetKind() const override { return ObjectKind::InCoreScopeStmt; } - [[nodiscard]] ScopeKind GetScopeKind() const override { return ScopeKind::InCore; } - [[nodiscard]] std::string TypeName() const override { return "InCoreScopeStmt"; } - - static constexpr auto GetFieldDescriptors() { - return std::tuple_cat(ScopeStmt::GetFieldDescriptors(), - std::make_tuple(reflection::UsualField(&InCoreScopeStmt::split_, "split"))); - } - - public: - std::optional split_; // Split mode (nullopt or None for no split) -}; - -using InCoreScopeStmtPtr = std::shared_ptr; - -/** - * @brief AutoInCore scope: InCore region with automatic chunking. - * - * Carries an optional `split` for cross-core transfer mode. 
- */ -class AutoInCoreScopeStmt : public ScopeStmt { - public: - AutoInCoreScopeStmt(std::optional split, std::string name_hint, StmtPtr body, Span span, - std::vector leading_comments = {}) - : ScopeStmt(std::move(name_hint), std::move(body), std::move(span), std::move(leading_comments)), - split_(split) {} - - [[nodiscard]] ObjectKind GetKind() const override { return ObjectKind::AutoInCoreScopeStmt; } - [[nodiscard]] ScopeKind GetScopeKind() const override { return ScopeKind::AutoInCore; } - [[nodiscard]] std::string TypeName() const override { return "AutoInCoreScopeStmt"; } - - static constexpr auto GetFieldDescriptors() { - return std::tuple_cat(ScopeStmt::GetFieldDescriptors(), - std::make_tuple(reflection::UsualField(&AutoInCoreScopeStmt::split_, "split"))); - } - - public: - std::optional split_; // Split mode (nullopt or None for no split) -}; - -using AutoInCoreScopeStmtPtr = std::shared_ptr; - /** * @brief Cluster scope: co-scheduled AIC + AIV group. * @@ -831,15 +770,18 @@ using ClusterScopeStmtPtr = std::shared_ptr; /** * @brief Hierarchy scope: distributed-hierarchy region. * - * Required `level`, optional `role`. Outlined into level-/role-annotated functions. + * Required `level`, optional `role`, optional `split`. `split` is only valid + * when `level == Level::CORE_GROUP`; it carries the AIC/AIV cross-core split + * mode through to the outlined InCore function. */ class HierarchyScopeStmt : public ScopeStmt { public: - HierarchyScopeStmt(Level level, std::optional role, std::string name_hint, StmtPtr body, Span span, - std::vector leading_comments = {}) - : ScopeStmt(std::move(name_hint), std::move(body), std::move(span), std::move(leading_comments)), - level_(level), - role_(role) {} + // Out-of-line definition in src/ir/stmt.cpp so the CORE_GROUP check can see + // the full Level enum (function.h is not included from this header to avoid + // a circular dependency). 
+ HierarchyScopeStmt(Level level, std::optional role, std::optional split, + std::string name_hint, StmtPtr body, Span span, + std::vector leading_comments = {}); [[nodiscard]] ObjectKind GetKind() const override { return ObjectKind::HierarchyScopeStmt; } [[nodiscard]] ScopeKind GetScopeKind() const override { return ScopeKind::Hierarchy; } @@ -848,12 +790,14 @@ class HierarchyScopeStmt : public ScopeStmt { static constexpr auto GetFieldDescriptors() { return std::tuple_cat(ScopeStmt::GetFieldDescriptors(), std::make_tuple(reflection::UsualField(&HierarchyScopeStmt::level_, "level"), - reflection::UsualField(&HierarchyScopeStmt::role_, "role"))); + reflection::UsualField(&HierarchyScopeStmt::role_, "role"), + reflection::UsualField(&HierarchyScopeStmt::split_, "split"))); } public: - Level level_; ///< Hierarchy level (required) - std::optional role_; ///< Function role (Orchestrator or Worker) + Level level_; ///< Hierarchy level (required) + std::optional role_; ///< Function role (Orchestrator or Worker) + std::optional split_; ///< AIC/AIV split mode (only valid at CORE_GROUP) }; using HierarchyScopeStmtPtr = std::shared_ptr; diff --git a/include/pypto/ir/transforms/base/functor.h b/include/pypto/ir/transforms/base/functor.h index 3216913bd..ceeb01b87 100644 --- a/include/pypto/ir/transforms/base/functor.h +++ b/include/pypto/ir/transforms/base/functor.h @@ -186,8 +186,6 @@ class StmtFunctor { virtual R VisitStmt_(const ReturnStmtPtr& op, Args... args) = 0; virtual R VisitStmt_(const ForStmtPtr& op, Args... args) = 0; virtual R VisitStmt_(const WhileStmtPtr& op, Args... args) = 0; - virtual R VisitStmt_(const InCoreScopeStmtPtr& op, Args... args) = 0; - virtual R VisitStmt_(const AutoInCoreScopeStmtPtr& op, Args... args) = 0; virtual R VisitStmt_(const ClusterScopeStmtPtr& op, Args... args) = 0; virtual R VisitStmt_(const HierarchyScopeStmtPtr& op, Args... args) = 0; virtual R VisitStmt_(const SpmdScopeStmtPtr& op, Args... 
args) = 0; @@ -213,8 +211,6 @@ R StmtFunctor::VisitStmt(const StmtPtr& stmt, Args... args) { STMT_FUNCTOR_DISPATCH(ReturnStmt); STMT_FUNCTOR_DISPATCH(ForStmt); STMT_FUNCTOR_DISPATCH(WhileStmt); - STMT_FUNCTOR_DISPATCH(InCoreScopeStmt); - STMT_FUNCTOR_DISPATCH(AutoInCoreScopeStmt); STMT_FUNCTOR_DISPATCH(ClusterScopeStmt); STMT_FUNCTOR_DISPATCH(HierarchyScopeStmt); STMT_FUNCTOR_DISPATCH(SpmdScopeStmt); diff --git a/include/pypto/ir/transforms/base/mutator.h b/include/pypto/ir/transforms/base/mutator.h index 69ccc051c..712db545d 100644 --- a/include/pypto/ir/transforms/base/mutator.h +++ b/include/pypto/ir/transforms/base/mutator.h @@ -95,8 +95,6 @@ class IRMutator : public ExprFunctor, public StmtFunctor { StmtPtr VisitStmt_(const ReturnStmtPtr& op) override; StmtPtr VisitStmt_(const ForStmtPtr& op) override; StmtPtr VisitStmt_(const WhileStmtPtr& op) override; - StmtPtr VisitStmt_(const InCoreScopeStmtPtr& op) override; - StmtPtr VisitStmt_(const AutoInCoreScopeStmtPtr& op) override; StmtPtr VisitStmt_(const ClusterScopeStmtPtr& op) override; StmtPtr VisitStmt_(const HierarchyScopeStmtPtr& op) override; StmtPtr VisitStmt_(const SpmdScopeStmtPtr& op) override; diff --git a/include/pypto/ir/transforms/base/visitor.h b/include/pypto/ir/transforms/base/visitor.h index 41d4b5a7c..5f24c47c3 100644 --- a/include/pypto/ir/transforms/base/visitor.h +++ b/include/pypto/ir/transforms/base/visitor.h @@ -98,8 +98,6 @@ class IRVisitor : public IRFunctor { void VisitStmt_(const ReturnStmtPtr& op) override; void VisitStmt_(const ForStmtPtr& op) override; void VisitStmt_(const WhileStmtPtr& op) override; - void VisitStmt_(const InCoreScopeStmtPtr& op) override; - void VisitStmt_(const AutoInCoreScopeStmtPtr& op) override; void VisitStmt_(const ClusterScopeStmtPtr& op) override; void VisitStmt_(const HierarchyScopeStmtPtr& op) override; void VisitStmt_(const SpmdScopeStmtPtr& op) override; diff --git a/include/pypto/ir/transforms/ir_property.h 
b/include/pypto/ir/transforms/ir_property.h index a2b62dc48..6c3acba08 100644 --- a/include/pypto/ir/transforms/ir_property.h +++ b/include/pypto/ir/transforms/ir_property.h @@ -35,7 +35,6 @@ enum class IRProperty : uint64_t { NoNestedCalls, ///< No nested call expressions NormalizedStmtStructure, ///< Statement structure normalized NoRedundantBlocks, ///< No single-child or nested SeqStmts - SplitIncoreOrch, ///< InCore scopes outlined into separate functions HasMemRefs, ///< MemRef objects initialized on variables IncoreTileOps, ///< InCore functions use tile ops (tile types, load/store) AllocatedMemoryAddr, ///< All MemRefs have valid addresses within buffer limits @@ -45,11 +44,10 @@ enum class IRProperty : uint64_t { TileMemoryInferred, ///< TileType memory_space_ populated in InCore functions BreakContinueValid, ///< Break/continue only in sequential/while loops UseAfterDef, ///< All variable uses are dominated by a definition - HierarchyOutlined, ///< Hierarchy scopes outlined into level/role functions + HierarchyOutlined, ///< Hierarchy scopes outlined into level/role functions (CORE_GROUP→InCore funcs) StructuredCtrlFlow, ///< No BreakStmt/ContinueStmt — only structured control flow VectorKernelSplit, ///< AIV functions with split mode have tpop shapes and store offsets adjusted OutParamNotShadowed, ///< Out/InOut params are not reassigned with tensor-creating ops - NoNestedInCore, ///< No nested InCore scopes (ScopeStmt inside ScopeStmt) InOutUseValid, ///< No reads of InOut/Out-passed variables after the call (RFC #1026) kCount ///< Sentinel (must be last) }; @@ -191,7 +189,7 @@ const IRPropertySet& GetVerifiedProperties(); * * These are verified automatically at pipeline start and never declared * in per-pass PassProperties. Returns {TypeChecked, BreakContinueValid, - * NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore}. + * NoRedundantBlocks, UseAfterDef, OutParamNotShadowed}. 
*/ const IRPropertySet& GetStructuralProperties(); @@ -199,7 +197,7 @@ const IRPropertySet& GetStructuralProperties(); * @brief Default property set for explicit verification * * Returns {SSAForm, TypeChecked, NoNestedCalls, BreakContinueValid, - * NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore} — the properties checked by + * NoRedundantBlocks, UseAfterDef, OutParamNotShadowed} — the properties checked by * run_verifier() when no explicit set is given. */ const IRPropertySet& GetDefaultVerifyProperties(); diff --git a/include/pypto/ir/transforms/pass_properties.h b/include/pypto/ir/transforms/pass_properties.h index 8c5063347..9e092e3da 100644 --- a/include/pypto/ir/transforms/pass_properties.h +++ b/include/pypto/ir/transforms/pass_properties.h @@ -32,18 +32,6 @@ inline const PassProperties kUnrollLoopsProperties{}; inline const PassProperties kCtrlFlowTransformProperties{.produced = {IRProperty::StructuredCtrlFlow}}; -// -- Loop chunking pass (runs after SSA) -------------------------------------- - -inline const PassProperties kSplitChunkedLoopsProperties{ - .required = {IRProperty::SSAForm, IRProperty::NormalizedStmtStructure}, - .produced = {IRProperty::SSAForm, IRProperty::NormalizedStmtStructure}}; - -// -- Chunk loop interchange pass (runs after SplitChunkedLoops) --------------- - -inline const PassProperties kInterchangeChunkLoopsProperties{ - .required = {IRProperty::SSAForm, IRProperty::NormalizedStmtStructure}, - .produced = {IRProperty::SSAForm, IRProperty::NormalizedStmtStructure}}; - // -- SSA conversion pass ------------------------------------------------------ inline const PassProperties kConvertToSSAProperties{.produced = {IRProperty::SSAForm}, @@ -62,32 +50,40 @@ inline const PassProperties kNormalizeStmtStructureProperties{ inline const PassProperties kSimplifyProperties{}; -// -- Outlining pass ----------------------------------------------------------- - -inline const PassProperties kOutlineIncoreScopesProperties{ - 
.required = {IRProperty::SSAForm}, .produced = {IRProperty::SSAForm, IRProperty::SplitIncoreOrch}}; - // -- Cluster outlining pass --------------------------------------------------- inline const PassProperties kOutlineClusterScopesProperties{ .required = {IRProperty::SSAForm}, .produced = {IRProperty::SSAForm, IRProperty::ClusterOutlined}}; -// -- Hierarchy outlining pass ------------------------------------------------- +// -- Hierarchy outlining passes ----------------------------------------------- +// +// Hierarchy outlining is split between two passes that share the +// `HierarchyOutlined` property: +// - OutlineHierarchyScopes outlines every HierarchyScopeStmt with +// `level_ != CORE_GROUP` into Opaque functions. CORE_GROUP scopes are +// preserved verbatim for the next pass. +// - OutlineIncoreScopes outlines the remaining CORE_GROUP HierarchyScopeStmts +// into InCore functions and promotes the parent function from Opaque to +// Orchestration. It produces `HierarchyOutlined` (no Hierarchy scopes +// remain in Opaque/Orchestration bodies after both passes have run). 
+ +inline const PassProperties kOutlineHierarchyScopesProperties{.required = {IRProperty::SSAForm}, + .produced = {IRProperty::SSAForm}}; -inline const PassProperties kOutlineHierarchyScopesProperties{ +inline const PassProperties kOutlineIncoreScopesProperties{ .required = {IRProperty::SSAForm}, .produced = {IRProperty::SSAForm, IRProperty::HierarchyOutlined}}; // -- Tensor-to-tile conversion pass ------------------------------------------ inline const PassProperties kConvertTensorToTileOpsProperties{ - .required = {IRProperty::SSAForm, IRProperty::SplitIncoreOrch, IRProperty::NormalizedStmtStructure}, + .required = {IRProperty::SSAForm, IRProperty::HierarchyOutlined, IRProperty::NormalizedStmtStructure}, .produced = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::NormalizedStmtStructure}}; // -- Orchestration tensor optimization pass ----------------------------------- inline const PassProperties kOptimizeOrchTensorsProperties{ - .required = {IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps}, - .produced = {IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps}}; + .required = {IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps}, + .produced = {IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps}}; // -- Tile ND-to-2D flattening pass -------------------------------------------- @@ -98,31 +94,31 @@ inline const PassProperties kFlattenTileNdTo2DProperties{ // -- Tile memory space inference pass ----------------------------------------- inline const PassProperties kInferTileMemorySpaceProperties{ - .required = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::SplitIncoreOrch, + .required = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::HierarchyOutlined, IRProperty::NormalizedStmtStructure}, .produced = {IRProperty::SSAForm, IRProperty::TileMemoryInferred, IRProperty::NormalizedStmtStructure}}; // -- Resolve transpose layout pass -------------------------------------------- inline const PassProperties 
kResolveTransposeLayoutProperties{ - .required = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::SplitIncoreOrch, + .required = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::HierarchyOutlined, IRProperty::TileOps2D}, - .produced = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::SplitIncoreOrch, + .produced = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::HierarchyOutlined, IRProperty::TileOps2D}}; // -- Resolve backend op layouts pass ------------------------------------------ inline const PassProperties kResolveBackendOpLayoutsProperties{ - .required = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::SplitIncoreOrch, + .required = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::HierarchyOutlined, IRProperty::TileOps2D}, - .produced = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::SplitIncoreOrch, + .produced = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::HierarchyOutlined, IRProperty::TileOps2D}, .invalidated = {IRProperty::NormalizedStmtStructure}}; // -- Mixed kernel expansion pass ---------------------------------------------- inline const PassProperties kExpandMixedKernelProperties{ - .required = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::SplitIncoreOrch, + .required = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::HierarchyOutlined, IRProperty::TileOps2D, IRProperty::TileMemoryInferred, IRProperty::NormalizedStmtStructure}, .produced = {IRProperty::SSAForm, IRProperty::MixedKernelExpanded, IRProperty::NormalizedStmtStructure}}; @@ -135,42 +131,42 @@ inline const PassProperties kSplitVectorKernelProperties{ // -- Memory / codegen passes -------------------------------------------------- inline const PassProperties kInitMemRefProperties{ - .required = {IRProperty::SSAForm, IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps, + .required = {IRProperty::SSAForm, IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps, 
IRProperty::TileOps2D, IRProperty::TileMemoryInferred}, .produced = {IRProperty::HasMemRefs, IRProperty::NormalizedStmtStructure}, .invalidated = {IRProperty::SSAForm}}; inline const PassProperties kMemoryReuseProperties{ - .required = {IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps, IRProperty::HasMemRefs, + .required = {IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps, IRProperty::HasMemRefs, IRProperty::TileOps2D, IRProperty::NormalizedStmtStructure}, .produced = {IRProperty::NormalizedStmtStructure}}; inline const PassProperties kInsertSyncProperties{ - .required = {IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps, IRProperty::HasMemRefs, + .required = {IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps, IRProperty::HasMemRefs, IRProperty::TileOps2D}}; inline const PassProperties kAllocateMemoryAddrProperties{ - .required = {IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps, IRProperty::HasMemRefs, + .required = {IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps, IRProperty::HasMemRefs, IRProperty::TileOps2D}, .produced = {IRProperty::AllocatedMemoryAddr}}; // -- Return order normalization pass ------------------------------------------ inline const PassProperties kNormalizeReturnOrderProperties{ - .required = {IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps}}; + .required = {IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps}}; // -- Partial unroll + reorder passes (tile-level, before InitMemRef) --------- inline const PassProperties kPartialUnrollTileLoopsProperties{ - .required = {IRProperty::SSAForm, IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps, + .required = {IRProperty::SSAForm, IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps, IRProperty::TileOps2D, IRProperty::TileMemoryInferred, IRProperty::NormalizedStmtStructure}, - .produced = {IRProperty::SSAForm, IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps, + .produced = {IRProperty::SSAForm, IRProperty::HierarchyOutlined, 
IRProperty::IncoreTileOps, IRProperty::TileOps2D, IRProperty::TileMemoryInferred, IRProperty::NormalizedStmtStructure}}; inline const PassProperties kReorderUnrolledIOProperties{ - .required = {IRProperty::SSAForm, IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps, + .required = {IRProperty::SSAForm, IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps, IRProperty::TileOps2D, IRProperty::TileMemoryInferred, IRProperty::NormalizedStmtStructure}, - .produced = {IRProperty::SSAForm, IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps, + .produced = {IRProperty::SSAForm, IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps, IRProperty::TileOps2D, IRProperty::TileMemoryInferred, IRProperty::NormalizedStmtStructure}}; } // namespace pass diff --git a/include/pypto/ir/transforms/passes.h b/include/pypto/ir/transforms/passes.h index abbf21f1b..400846a09 100644 --- a/include/pypto/ir/transforms/passes.h +++ b/include/pypto/ir/transforms/passes.h @@ -182,25 +182,6 @@ Pass InsertSync(); */ Pass AllocateMemoryAddr(); -/** - * @brief Create a loop chunking pass - * - * Splits ForStmt nodes with chunk_size into nested loops: an outer loop - * iterating over chunk indices and an inner loop iterating within each chunk. - * Requires SSA form input and produces SSA form output. - */ -Pass SplitChunkedLoops(); - -/** - * @brief Interchange chunk loops and insert InCore scopes - * - * Reorders nested ChunkOuter/ChunkInner loop pairs so that all outer loops - * are on top, then wraps the inner loops + body in a ScopeStmt(InCore). - * Only interchanges when all ChunkInner loops are Parallel. - * Requires SSA form input and produces SSA form output. 
- */ -Pass InterchangeChunkLoops(); - /** * @brief Create a loop unrolling pass * @@ -263,23 +244,35 @@ Pass CtrlFlowTransform(); Pass ConvertToSSA(); /** - * @brief Outline InCore scopes into separate functions + * @brief Outline non-CORE_GROUP Hierarchy scopes into separate Opaque functions + * + * Outlines every `HierarchyScopeStmt` whose `level_` is anything other than + * `CORE_GROUP`, carrying the scope's level/role onto the outlined function. The + * parent function's type is preserved (it stays `Opaque`). CORE_GROUP scopes + * survive this pass for `OutlineIncoreScopes` to handle. * * Requirements: * - Input IR must be in SSA form (run ConvertToSSA first) - * - Only processes Opaque functions + * - Only processes Opaque functions containing Hierarchy scopes + * - Should run before OutlineIncoreScopes and OutlineClusterScopes */ -Pass OutlineIncoreScopes(); +Pass OutlineHierarchyScopes(); /** - * @brief Outline Hierarchy scopes into separate functions with level/role + * @brief Outline CORE_GROUP Hierarchy scopes into InCore functions + * + * Outlines every `HierarchyScopeStmt(level=CORE_GROUP)` into a separate + * `Function(InCore)` and promotes the parent function from `Opaque` to + * `Orchestration` when any CORE_GROUP scope was outlined. Together with + * `OutlineHierarchyScopes`, establishes the `HierarchyOutlined` property: no + * `HierarchyScopeStmt` remains in any Opaque/Orchestration function body. 
* * Requirements: * - Input IR must be in SSA form (run ConvertToSSA first) - * - Only processes Opaque functions containing Hierarchy scopes - * - Should run before OutlineIncoreScopes and OutlineClusterScopes + * - Should run after OutlineHierarchyScopes and before OutlineClusterScopes + * - Only processes Opaque functions */ -Pass OutlineHierarchyScopes(); +Pass OutlineIncoreScopes(); /** * @brief Outline Cluster scopes into separate Group functions @@ -298,7 +291,7 @@ Pass OutlineClusterScopes(); * orchestration call sites with tensor.create for output parameters. * * Requirements: - * - Input IR must have InCore scopes outlined (run OutlineIncoreScopes first) + * - Input IR must have InCore functions outlined (run OutlineHierarchyScopes first) */ Pass ConvertTensorToTileOps(); @@ -363,7 +356,7 @@ Pass InferTileMemorySpace(); * * Requirements: * - Input IR must have tile ops (run ConvertTensorToTileOps first) - * - Input IR must have InCore scopes outlined (run OutlineIncoreScopes first) + * - Input IR must have InCore functions outlined (run OutlineHierarchyScopes first) */ Pass ResolveTransposeLayout(); @@ -386,7 +379,7 @@ Pass ResolveBackendOpLayouts(); * * Requirements: * - Input IR must have tile ops (run ConvertTensorToTileOps first) - * - Input IR must have InCore scopes outlined (run OutlineIncoreScopes first) + * - Input IR must have InCore functions outlined (run OutlineHierarchyScopes first) */ Pass ExpandMixedKernel(); diff --git a/include/pypto/ir/transforms/utils/scope_outline_utils.h b/include/pypto/ir/transforms/utils/scope_outline_utils.h index 2a52d6fed..d60d9e1be 100644 --- a/include/pypto/ir/transforms/utils/scope_outline_utils.h +++ b/include/pypto/ir/transforms/utils/scope_outline_utils.h @@ -199,13 +199,29 @@ class VarCollector : public IRVisitor { * and a naming suffix. 
Handles SeqStmts specially to determine which scope-defined * variables are actually used after each scope (output filtering), and recursively * transforms scope bodies to handle nested scopes. + * + * For HierarchyScopeStmt, an optional `level_filter_` narrows which scopes are + * outlined: when set with mode `Only`, only scopes whose `level_` matches are + * outlined; when set with mode `Exclude`, scopes at the matching level are + * skipped (and the mutator descends into their body to outline nested scopes + * normally). Used to split outlining into two passes: `OutlineHierarchyScopes` + * (excludes CORE_GROUP, emits Opaque) and `OutlineIncoreScopes` (only + * CORE_GROUP, emits InCore). */ class ScopeOutliner : public IRMutator { public: + /// Hierarchy-level filter for ScopeOutliner. + struct HierarchyLevelFilter { + enum class Mode { Only, Exclude }; + Level level; + Mode mode; + }; + ScopeOutliner(std::string func_name, const std::unordered_map& var_types, const std::unordered_map& var_objects, const std::unordered_set& known_names, ScopeKind target_scope_kind, - FunctionType outlined_func_type, std::string name_suffix, ProgramPtr program = nullptr) + FunctionType outlined_func_type, std::string name_suffix, ProgramPtr program = nullptr, + std::optional level_filter = std::nullopt) : func_name_(std::move(func_name)), var_types_(var_types), var_objects_(var_objects), @@ -213,7 +229,8 @@ class ScopeOutliner : public IRMutator { target_scope_kind_(target_scope_kind), outlined_func_type_(outlined_func_type), name_suffix_(std::move(name_suffix)), - program_(std::move(program)) {} + program_(std::move(program)), + level_filter_(level_filter) {} [[nodiscard]] const std::vector& GetOutlinedFunctions() const { return outlined_functions_; } @@ -244,7 +261,7 @@ class ScopeOutliner : public IRMutator { for (size_t i = 0; i < op->stmts_.size(); ++i) { auto scope = std::dynamic_pointer_cast(op->stmts_[i]); - if (scope && scope->GetScopeKind() == target_scope_kind_) { + if 
(scope && scope->GetScopeKind() == target_scope_kind_ && ShouldOutline(scope)) { // Collect variables referenced in all subsequent statements VarDefUseCollector after_ref_collector; for (size_t j = i + 1; j < op->stmts_.size(); ++j) { @@ -300,6 +317,9 @@ class ScopeOutliner : public IRMutator { if (op->GetScopeKind() != target_scope_kind_) { return IRMutator::VisitStmt_(op); } + if (!ShouldOutline(op)) { + return IRMutator::VisitStmt_(op); + } VarDefUseCollector def_collector; def_collector.VisitStmt(op->body_); StoreTargetCollector store_collector; @@ -308,12 +328,21 @@ class ScopeOutliner : public IRMutator { return OutlineScope(op, def_collector.var_defs); } - StmtPtr VisitStmt_(const InCoreScopeStmtPtr& op) override { return VisitScopeKind(op); } - StmtPtr VisitStmt_(const AutoInCoreScopeStmtPtr& op) override { return VisitScopeKind(op); } StmtPtr VisitStmt_(const ClusterScopeStmtPtr& op) override { return VisitScopeKind(op); } StmtPtr VisitStmt_(const HierarchyScopeStmtPtr& op) override { return VisitScopeKind(op); } StmtPtr VisitStmt_(const SpmdScopeStmtPtr& op) override { return VisitScopeKind(op); } + /// Apply the optional hierarchy-level filter. Non-Hierarchy scopes are + /// unaffected; Hierarchy scopes are matched against `level_filter_.level` + /// and accepted/rejected per `level_filter_.mode`. + bool ShouldOutline(const ScopeStmtPtr& op) const { + if (!level_filter_.has_value()) return true; + auto hier = As(op); + if (!hier) return true; + bool matches = (hier->level_ == level_filter_->level); + return level_filter_->mode == HierarchyLevelFilter::Mode::Only ? matches : !matches; + } + private: /** * @brief Outline a single scope into a separate function. 
@@ -540,18 +569,12 @@ class ScopeOutliner : public IRMutator { outlined_body = std::make_shared(body_stmts, op->span_); } - // Register the outlined function (propagate level/role from ScopeStmt, convert split/core_num to attrs) + // Register the outlined function (propagate level/role/split from ScopeStmt, convert split/core_num to + // attrs). When outlining a HierarchyScopeStmt at Level::CORE_GROUP, the outlined function becomes + // FunctionType::InCore regardless of the default outlined_func_type_ — this replaces the former + // OutlineIncoreScopes pass. std::vector> outlined_attrs; - auto append_split_attr = [&](std::optional split) { - if (split.has_value() && split.value() != SplitMode::None) { - outlined_attrs.emplace_back("split", static_cast(split.value())); - } - }; - if (auto incore = As(op)) { - append_split_attr(incore->split_); - } else if (auto auto_incore = As(op)) { - append_split_attr(auto_incore->split_); - } else if (auto spmd = As(op)) { + if (auto spmd = As(op)) { outlined_attrs.emplace_back("core_num", spmd->core_num_); if (spmd->sync_start_) { outlined_attrs.emplace_back("sync_start", true); @@ -562,6 +585,9 @@ class ScopeOutliner : public IRMutator { if (auto hier = As(op)) { outlined_level = hier->level_; outlined_role = hier->role_; + if (hier->split_.has_value() && hier->split_.value() != SplitMode::None) { + outlined_attrs.emplace_back("split", static_cast(hier->split_.value())); + } } auto outlined_func = std::make_shared( outlined_func_name, input_params, input_param_directions, return_types, outlined_body, op->span_, @@ -793,6 +819,7 @@ class ScopeOutliner : public IRMutator { FunctionType outlined_func_type_; std::string name_suffix_; ProgramPtr program_; + std::optional level_filter_; int scope_counter_ = 0; std::vector outlined_functions_; }; @@ -807,7 +834,7 @@ class ScopeOutliner : public IRMutator { /// have been successfully outlined into separate functions. 
/// /// Usage: -/// ScopeKindAbsenceVerifier verifier(diagnostics, "PassName", "error message"); +/// ScopeKindAbsenceVerifier verifier(diagnostics, "PassName", "error message"); /// verifier.VisitStmt(func->body_); template class ScopeKindAbsenceVerifier : public IRVisitor { @@ -824,8 +851,6 @@ class ScopeKindAbsenceVerifier : public IRVisitor { IRVisitor::VisitStmt_(op); } - void VisitStmt_(const InCoreScopeStmtPtr& op) override { CheckKind(op); } - void VisitStmt_(const AutoInCoreScopeStmtPtr& op) override { CheckKind(op); } void VisitStmt_(const ClusterScopeStmtPtr& op) override { CheckKind(op); } void VisitStmt_(const HierarchyScopeStmtPtr& op) override { CheckKind(op); } void VisitStmt_(const SpmdScopeStmtPtr& op) override { CheckKind(op); } diff --git a/include/pypto/ir/verifier/verifier.h b/include/pypto/ir/verifier/verifier.h index 2fcb11603..59bc00b8e 100644 --- a/include/pypto/ir/verifier/verifier.h +++ b/include/pypto/ir/verifier/verifier.h @@ -106,12 +106,6 @@ PropertyVerifierPtr CreateNormalizedStmtPropertyVerifier(); */ PropertyVerifierPtr CreateNoRedundantBlocksPropertyVerifier(); -/** - * @brief Factory function for creating SplitIncoreOrch property verifier - * @return Shared pointer to SplitIncoreOrch PropertyVerifier - */ -PropertyVerifierPtr CreateSplitIncoreOrchPropertyVerifier(); - /** * @brief Factory function for creating ClusterOutlined property verifier * @return Shared pointer to ClusterOutlined PropertyVerifier @@ -213,14 +207,6 @@ PropertyVerifierPtr CreateStructuredCtrlFlowPropertyVerifier(); */ PropertyVerifierPtr CreateOutParamNotShadowedPropertyVerifier(); -/** - * @brief Factory function for creating NoNestedInCore property verifier - * - * Verifies that no ScopeStmt(InCore) is nested inside another ScopeStmt(InCore). 
- * @return Shared pointer to NoNestedInCore PropertyVerifier - */ -PropertyVerifierPtr CreateNoNestedIncorePropertyVerifier(); - /** * @brief Factory function for creating InOutUseValid property verifier * diff --git a/pyproject.toml b/pyproject.toml index 5933663e1..315b653ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,8 +81,6 @@ fixable = ["ALL"] "tests/ut/ir/transforms/test_init_memref.py" = ["E501"] "tests/ut/ir/transforms/test_memory_reuse.py" = ["E501"] "tests/ut/ir/transforms/test_infer_tile_memory_space.py" = ["E501"] -"tests/ut/ir/transforms/test_interchange_chunk_loops.py" = ["E501", "F841"] -"tests/ut/ir/transforms/test_outline_incore_interleaved_ops.py" = ["E501"] "tests/ut/ir/transforms/test_legalize_pto_buffer_reuse.py" = ["E501", "F841"] # IR dumps are formatted at 200-col for readability — suppress line-length lint "build_output/**" = ["E501"] diff --git a/python/bindings/modules/functor.cpp b/python/bindings/modules/functor.cpp index 0c65960f0..117abbb6e 100644 --- a/python/bindings/modules/functor.cpp +++ b/python/bindings/modules/functor.cpp @@ -50,7 +50,7 @@ using namespace pypto::ir; // NOLINT(build/namespaces) // --- IRVisitor trampoline --- struct PyIRVisitor : IRVisitor { - NB_TRAMPOLINE(IRVisitor, 59); // 31 base + 23 binary + 5 unary (5 scope kinds) + NB_TRAMPOLINE(IRVisitor, 57); // 29 base + 23 binary + 5 unary (3 scope kinds) // Top-level entry points void VisitProgram(const ProgramPtr& p) override { NB_OVERRIDE_NAME("visit_program", VisitProgram, p); } @@ -128,8 +128,6 @@ struct PyIRVisitor : IRVisitor { VISITOR_STMT_TRAMPOLINE(IfStmt, visit_if_stmt) VISITOR_STMT_TRAMPOLINE(ForStmt, visit_for_stmt) VISITOR_STMT_TRAMPOLINE(WhileStmt, visit_while_stmt) - VISITOR_STMT_TRAMPOLINE(InCoreScopeStmt, visit_in_core_scope_stmt) - VISITOR_STMT_TRAMPOLINE(AutoInCoreScopeStmt, visit_auto_in_core_scope_stmt) VISITOR_STMT_TRAMPOLINE(ClusterScopeStmt, visit_cluster_scope_stmt) VISITOR_STMT_TRAMPOLINE(HierarchyScopeStmt, 
visit_hierarchy_scope_stmt) VISITOR_STMT_TRAMPOLINE(SpmdScopeStmt, visit_spmd_scope_stmt) @@ -143,7 +141,7 @@ struct PyIRVisitor : IRVisitor { // --- IRMutator trampoline --- struct PyIRMutator : IRMutator { - NB_TRAMPOLINE(IRMutator, 58); // 30 base + 23 binary + 5 unary (5 scope kinds) + NB_TRAMPOLINE(IRMutator, 56); // 28 base + 23 binary + 5 unary (3 scope kinds) // Top-level entry points ProgramPtr VisitProgram(const ProgramPtr& p) override { @@ -222,8 +220,6 @@ struct PyIRMutator : IRMutator { MUTATOR_STMT_TRAMPOLINE(IfStmt, visit_if_stmt) MUTATOR_STMT_TRAMPOLINE(ForStmt, visit_for_stmt) MUTATOR_STMT_TRAMPOLINE(WhileStmt, visit_while_stmt) - MUTATOR_STMT_TRAMPOLINE(InCoreScopeStmt, visit_in_core_scope_stmt) - MUTATOR_STMT_TRAMPOLINE(AutoInCoreScopeStmt, visit_auto_in_core_scope_stmt) MUTATOR_STMT_TRAMPOLINE(ClusterScopeStmt, visit_cluster_scope_stmt) MUTATOR_STMT_TRAMPOLINE(HierarchyScopeStmt, visit_hierarchy_scope_stmt) MUTATOR_STMT_TRAMPOLINE(SpmdScopeStmt, visit_spmd_scope_stmt) @@ -350,8 +346,6 @@ void BindFunctor(nb::module_& m) { BIND_VISITOR(visitor_cls, IfStmt, visit_if_stmt); BIND_VISITOR(visitor_cls, ForStmt, visit_for_stmt); BIND_VISITOR(visitor_cls, WhileStmt, visit_while_stmt); - BIND_VISITOR(visitor_cls, InCoreScopeStmt, visit_in_core_scope_stmt); - BIND_VISITOR(visitor_cls, AutoInCoreScopeStmt, visit_auto_in_core_scope_stmt); BIND_VISITOR(visitor_cls, ClusterScopeStmt, visit_cluster_scope_stmt); BIND_VISITOR(visitor_cls, HierarchyScopeStmt, visit_hierarchy_scope_stmt); BIND_VISITOR(visitor_cls, SpmdScopeStmt, visit_spmd_scope_stmt); @@ -445,8 +439,6 @@ void BindFunctor(nb::module_& m) { BIND_MUTATOR(mutator_cls, IfStmt, visit_if_stmt); BIND_MUTATOR(mutator_cls, ForStmt, visit_for_stmt); BIND_MUTATOR(mutator_cls, WhileStmt, visit_while_stmt); - BIND_MUTATOR(mutator_cls, InCoreScopeStmt, visit_in_core_scope_stmt); - BIND_MUTATOR(mutator_cls, AutoInCoreScopeStmt, visit_auto_in_core_scope_stmt); BIND_MUTATOR(mutator_cls, ClusterScopeStmt, 
visit_cluster_scope_stmt); BIND_MUTATOR(mutator_cls, HierarchyScopeStmt, visit_hierarchy_scope_stmt); BIND_MUTATOR(mutator_cls, SpmdScopeStmt, visit_spmd_scope_stmt); diff --git a/python/bindings/modules/ir.cpp b/python/bindings/modules/ir.cpp index c85fb6ab6..fc72bf72f 100644 --- a/python/bindings/modules/ir.cpp +++ b/python/bindings/modules/ir.cpp @@ -927,10 +927,8 @@ void BindIR(nb::module_& m) { // ScopeKind enum nb::enum_(ir, "ScopeKind", "Scope kind classification") - .value("InCore", ScopeKind::InCore, "InCore scope for AICore sub-graphs") - .value("AutoInCore", ScopeKind::AutoInCore, "AutoInCore scope for automatic chunking") .value("Cluster", ScopeKind::Cluster, "Cluster scope for co-scheduled AIC + AIV groups") - .value("Hierarchy", ScopeKind::Hierarchy, "Distributed hierarchy scope (uses level/role)") + .value("Hierarchy", ScopeKind::Hierarchy, "Distributed hierarchy scope (uses level/role/split)") .value("Spmd", ScopeKind::Spmd, "SPMD dispatch scope (core_num/sync_start)") .export_values(); @@ -947,23 +945,6 @@ void BindIR(nb::module_& m) { scope_stmt_class.def_prop_ro("scope_kind", &ScopeStmt::GetScopeKind, "Discriminator for the scope kind"); BindFields(scope_stmt_class); // exposes name_hint, body - // InCoreScopeStmt - auto in_core_scope_stmt_class = - nb::class_(ir, "InCoreScopeStmt", "InCore scope: AICore sub-graph region"); - in_core_scope_stmt_class.def(nb::init, std::string, const StmtPtr&, const Span&>(), - nb::arg("split") = nb::none(), nb::arg("name_hint") = "", nb::arg("body"), - nb::arg("span"), "Create an InCore scope statement"); - BindFields(in_core_scope_stmt_class); - - // AutoInCoreScopeStmt - auto auto_in_core_scope_stmt_class = nb::class_( - ir, "AutoInCoreScopeStmt", "AutoInCore scope: InCore region with automatic chunking"); - auto_in_core_scope_stmt_class.def( - nb::init, std::string, const StmtPtr&, const Span&>(), - nb::arg("split") = nb::none(), nb::arg("name_hint") = "", nb::arg("body"), nb::arg("span"), - "Create an 
AutoInCore scope statement"); - BindFields(auto_in_core_scope_stmt_class); - // ClusterScopeStmt auto cluster_scope_stmt_class = nb::class_( ir, "ClusterScopeStmt", "Cluster scope: co-scheduled AIC + AIV group"); @@ -975,10 +956,11 @@ void BindIR(nb::module_& m) { // HierarchyScopeStmt auto hierarchy_scope_stmt_class = nb::class_( ir, "HierarchyScopeStmt", "Hierarchy scope: distributed-hierarchy region"); - hierarchy_scope_stmt_class.def( - nb::init, std::string, const StmtPtr&, const Span&>(), nb::arg("level"), - nb::arg("role") = nb::none(), nb::arg("name_hint") = "", nb::arg("body"), nb::arg("span"), - "Create a Hierarchy scope statement"); + hierarchy_scope_stmt_class.def(nb::init, std::optional, std::string, + const StmtPtr&, const Span&>(), + nb::arg("level"), nb::arg("role") = nb::none(), + nb::arg("split") = nb::none(), nb::arg("name_hint") = "", nb::arg("body"), + nb::arg("span"), "Create a Hierarchy scope statement"); BindFields(hierarchy_scope_stmt_class); // SpmdScopeStmt diff --git a/python/bindings/modules/passes.cpp b/python/bindings/modules/passes.cpp index a59c36e13..0d6b382ed 100644 --- a/python/bindings/modules/passes.cpp +++ b/python/bindings/modules/passes.cpp @@ -48,7 +48,6 @@ void BindPass(nb::module_& m) { .value("NoNestedCalls", IRProperty::NoNestedCalls, "No nested call expressions") .value("NormalizedStmtStructure", IRProperty::NormalizedStmtStructure, "Statement structure normalized") .value("NoRedundantBlocks", IRProperty::NoRedundantBlocks, "No single-child or nested SeqStmts") - .value("SplitIncoreOrch", IRProperty::SplitIncoreOrch, "InCore scopes outlined into separate functions") .value("HasMemRefs", IRProperty::HasMemRefs, "MemRef objects initialized on variables") .value("IncoreTileOps", IRProperty::IncoreTileOps, "InCore functions use tile ops (tile types, load/store)") @@ -69,9 +68,7 @@ void BindPass(nb::module_& m) { "No BreakStmt/ContinueStmt — only structured control flow") .value("VectorKernelSplit", 
IRProperty::VectorKernelSplit, "AIV functions with split mode have tpop shapes and store offsets adjusted") - .value("OutParamNotShadowed", IRProperty::OutParamNotShadowed, "Out/InOut params are not reassigned") - .value("NoNestedInCore", IRProperty::NoNestedInCore, - "No nested InCore scopes (ScopeStmt inside ScopeStmt)"); + .value("OutParamNotShadowed", IRProperty::OutParamNotShadowed, "Out/InOut params are not reassigned"); // Bind IRPropertySet nb::class_(passes, "IRPropertySet", "A set of IR properties") @@ -318,10 +315,6 @@ void BindPass(nb::module_& m) { .value("USE_BEFORE_DEF", use_after_def::ErrorType::USE_BEFORE_DEF, "Variable used before any definition in scope"); - passes.def("split_chunked_loops", &pass::SplitChunkedLoops, - "Create a pass that splits chunked loops into nested loops"); - passes.def("interchange_chunk_loops", &pass::InterchangeChunkLoops, - "Create a pass that interchanges chunk loops and inserts InCore scopes"); passes.def("unroll_loops", &pass::UnrollLoops, "Create a loop unrolling pass"); passes.def("partial_unroll_tile_loops", &pass::PartialUnrollTileLoops, "Lower ``pl.range(N, unroll=F)`` loops at the tile level: replicate the body F\n" @@ -334,13 +327,15 @@ void BindPass(nb::module_& m) { passes.def("ctrl_flow_transform", &pass::CtrlFlowTransform, "Create a control flow structuring pass (eliminate break/continue)"); passes.def("convert_to_ssa", &pass::ConvertToSSA, "Create an SSA conversion pass"); - passes.def("outline_incore_scopes", &pass::OutlineIncoreScopes, - "Create a pass that outlines InCore scopes into separate functions"); passes.def("outline_cluster_scopes", &pass::OutlineClusterScopes, "Create a pass that outlines Cluster scopes into Group functions " "and standalone Spmd scopes into Spmd functions"); passes.def("outline_hierarchy_scopes", &pass::OutlineHierarchyScopes, - "Create a pass that outlines Hierarchy scopes into separate level/role functions"); + "Create a pass that outlines non-CORE_GROUP Hierarchy scopes 
into separate Opaque " + "level/role functions. CORE_GROUP scopes are left for outline_incore_scopes."); + passes.def("outline_incore_scopes", &pass::OutlineIncoreScopes, + "Create a pass that outlines CORE_GROUP Hierarchy scopes into InCore functions " + "and promotes the parent function from Opaque to Orchestration"); passes.def("convert_tensor_to_tile_ops", &pass::ConvertTensorToTileOps, "Create a pass that converts tensor ops to tile ops in InCore functions"); passes.def("optimize_orch_tensors", &pass::OptimizeOrchTensors, diff --git a/python/pypto/ir/builder.py b/python/pypto/ir/builder.py index b46b4e245..6e4981201 100644 --- a/python/pypto/ir/builder.py +++ b/python/pypto/ir/builder.py @@ -266,11 +266,11 @@ def scope( """Context manager for building scope statements. Args: - scope_kind: The kind of scope (e.g., ir.ScopeKind.InCore) + scope_kind: The kind of scope (e.g., ir.ScopeKind.Hierarchy) span: Optional explicit span. If None, automatically captured. level: Hierarchy level (for ScopeKind.Hierarchy) role: Function role (for ScopeKind.Hierarchy) - split: Split mode for cross-core transfer (for AutoInCore scopes) + split: Split mode for cross-core transfer (for ScopeKind.Hierarchy at CORE_GROUP) name_hint: User-provided scope name hint (empty = auto-generate) core_num: SPMD block count (for ScopeKind.Spmd scopes) sync_start: Require sync-start for SPMD dispatch (for ScopeKind.Spmd scopes) @@ -279,8 +279,8 @@ def scope( ScopeBuilder: Helper object for building the scope statement Example: - >>> with ib.scope(ir.ScopeKind.InCore) as scope_builder: - ... # InCore scope body + >>> with ib.scope(ir.ScopeKind.Hierarchy, level=ir.Level.CORE_GROUP) as scope_builder: + ... # CORE_GROUP scope body (outlined to Function(InCore)) ... 
ib.assign(y, add_expr) """ begin_span = span if span is not None else self._capture_call_span() diff --git a/python/pypto/ir/pass_manager.py b/python/pypto/ir/pass_manager.py index 3de50d5ad..cd3d7da22 100644 --- a/python/pypto/ir/pass_manager.py +++ b/python/pypto/ir/pass_manager.py @@ -124,8 +124,6 @@ def _register_passes(cls): ("FlattenCallExpr", lambda: passes.flatten_call_expr()), ] tensor_only_passes: list[PassSpec] = [ - ("SplitChunkedLoops", lambda: passes.split_chunked_loops()), - ("InterchangeChunkLoops", lambda: passes.interchange_chunk_loops()), ("OutlineHierarchyScopes", lambda: passes.outline_hierarchy_scopes()), ("OutlineIncoreScopes", lambda: passes.outline_incore_scopes()), ("OutlineClusterScopes", lambda: passes.outline_cluster_scopes()), diff --git a/python/pypto/language/__init__.py b/python/pypto/language/__init__.py index b2c77df7c..1a12d42de 100644 --- a/python/pypto/language/__init__.py +++ b/python/pypto/language/__init__.py @@ -59,12 +59,9 @@ def scalar_func(x: pl.Scalar[pl.FP32]) -> pl.Scalar[pl.FP32]: from . 
import optimizations, parser from .dsl_api import ( at, - auto_incore, - chunked_loop_optimizer, cluster, cond, const, - incore, parallel, range, spmd, @@ -169,7 +166,7 @@ def scalar_func(x: pl.Scalar[pl.FP32]) -> pl.Scalar[pl.FP32]: transpose, write, ) -from .optimizations import auto_chunk, split +from .optimizations import split from .parser.decorator import InlineFunction, function, inline, program from .parser.text_parser import loads, loads_program, parse, parse_program from .typing import DynVar, InOut, IntLike, MemRef, Out, Scalar, Tensor, Tile, Tuple, dynamic @@ -236,14 +233,10 @@ def scalar_func(x: pl.Scalar[pl.FP32]) -> pl.Scalar[pl.FP32]: "static_print", "static_assert", "at", - "incore", - "auto_incore", "cluster", "spmd", - "chunked_loop_optimizer", "optimizations", "split", - "auto_chunk", "tile", "system", "tensor", diff --git a/python/pypto/language/dsl_api.py b/python/pypto/language/dsl_api.py index af019d295..6cd9895a1 100644 --- a/python/pypto/language/dsl_api.py +++ b/python/pypto/language/dsl_api.py @@ -17,56 +17,9 @@ from pypto.language.typing import Scalar, Tensor, Tile from pypto.pypto_core import ir -from pypto.pypto_core.ir import SplitMode from .optimizations import Optimization - -class _ChunkedLoopOptimizerCall: - """Result of calling chunked_loop_optimizer(split=...). - - Stores the split mode to pass to the AutoInCore scope. - """ - - def __init__(self, split: SplitMode = SplitMode.UP_DOWN) -> None: - self.split = split - - def __repr__(self) -> str: - return f"chunked_loop_optimizer(split={self.split!r})" - - -class _ChunkedLoopOptimizer: - """Sentinel type for optimization=pl.chunked_loop_optimizer in pl.at(). 
- - Can be used bare or called with a split mode: - - ``optimization=pl.chunked_loop_optimizer`` - - ``optimization=pl.chunked_loop_optimizer(split=pl.SplitMode.UP_DOWN)`` - """ - - def __call__(self, *, split: SplitMode = SplitMode.UP_DOWN) -> _ChunkedLoopOptimizerCall: - """Create an optimizer specification with an explicit split mode. - - Args: - split: Split mode for cross-core data transfer (default: SplitMode.UP_DOWN) - - Returns: - Optimizer call with the given split mode - """ - return _ChunkedLoopOptimizerCall(split=split) - - def __repr__(self) -> str: - return "chunked_loop_optimizer" - - -chunked_loop_optimizer: _ChunkedLoopOptimizer = _ChunkedLoopOptimizer() -"""Sentinel for optimization=pl.chunked_loop_optimizer in pl.at(). - -Use with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer) -to request compiler-driven chunked loop outlining (replaces pl.auto_incore()). -Can also be called with a split mode: -pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer(split=pl.SplitMode.UP_DOWN)) -""" - # Range argument type: int literal or Scalar variable RangeArg = Union[int, "Scalar"] @@ -636,96 +589,6 @@ def static_assert(condition: Any, msg: str = "") -> None: """ -class IncoreContext: - """Context manager for InCore scope. - - This is returned by pl.incore() and used with the 'with' statement. - The parser recognizes this pattern and creates a ScopeStmt(InCore). - """ - - def __init__(self, split: SplitMode = SplitMode.NONE, name_hint: str = "") -> None: - self.split = split - self.name_hint = name_hint - - def __enter__(self) -> None: - """Enter the InCore scope context.""" - pass - - def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: - """Exit the InCore scope context.""" - pass - - -class AutoIncoreContext: - """Context manager for AutoInCore scope. - - This is returned by pl.auto_incore() and used with the 'with' statement. 
- The parser recognizes this pattern and creates a ScopeStmt(AutoInCore). - """ - - def __init__(self, split: SplitMode = SplitMode.NONE, name_hint: str = "") -> None: - self.split = split - self.name_hint = name_hint - - def __enter__(self) -> None: - """Enter the AutoInCore scope context.""" - pass - - def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: - """Exit the AutoInCore scope context.""" - pass - - -def auto_incore(split: SplitMode = SplitMode.UP_DOWN, *, name_hint: str = "") -> AutoIncoreContext: - """Mark a region of code for automatic incore chunking. - - This function returns a context manager that should be used with the 'with' statement. - The parser recognizes this pattern and creates a ScopeStmt with ScopeKind.AutoInCore. - - Args: - split: Split mode for cross-core data transfer (default: SplitMode.UP_DOWN) - - Returns: - Context manager for AutoInCore scope - - Examples: - >>> with pl.auto_incore(): - ... for i in pl.parallel(0, 8, 1, chunk=4): - ... x = pl.add(x, x) - >>> with pl.auto_incore(split=pl.SplitMode.UP_DOWN): - ... for i in pl.parallel(0, 8, 1, chunk=4): - ... x = pl.add(x, x) - """ - if split == SplitMode.NONE: - raise ValueError("SplitMode.NONE is not supported by pto-isa now") - return AutoIncoreContext(split=split, name_hint=name_hint) - - -def incore(split: SplitMode = SplitMode.NONE, *, name_hint: str = "") -> IncoreContext: - """Mark a region of code as belonging to the InCore execution context. - - This function returns a context manager that should be used with the 'with' statement. - The parser recognizes this pattern and creates a ScopeStmt with ScopeKind.InCore. - - Args: - split: Split mode for cross-core data transfer (default: SplitMode.NONE). - When set, the outlined InCore function will use the specified split - mode for data transfer between AIC and AIV cores. 
- name_hint: Optional name hint for the outlined function (must be a valid identifier) - - Returns: - Context manager for InCore scope - - Examples: - >>> with pl.incore(): - ... y = pl.ops.add(x, x) - ... z = pl.ops.mul(y, y) - >>> with pl.incore(split=pl.SplitMode.UP_DOWN): - ... y = pl.ops.add(x, x) - """ - return IncoreContext(split=split, name_hint=name_hint) - - class ClusterContext: """Context manager for Cluster scope. @@ -760,7 +623,7 @@ def cluster(*, name_hint: str = "") -> ClusterContext: Examples: >>> with pl.cluster(): - ... with pl.incore(): + ... with pl.at(level=pl.Level.CORE_GROUP): ... y = pl.add(x, x) """ return ClusterContext(name_hint=name_hint) @@ -827,11 +690,11 @@ class AtContext: """Context manager for hierarchy-level scope. Returned by pl.at(level=..., role=..., optimizations=[...]) and used with the - 'with' statement. The parser recognizes this pattern and creates: - - ScopeStmt(InCore) when level=CORE_GROUP (no optimizations) - - ScopeStmt(InCore, split=...) when level=CORE_GROUP with optimizations=[pl.split(...)] - - ScopeStmt(AutoInCore) when level=CORE_GROUP with optimizations=[pl.auto_chunk] - - ScopeStmt(Hierarchy) for all other levels + 'with' statement. 
The parser emits a HierarchyScopeStmt with: + - level = the given level (required) + - role = the optional role + - split = the SplitMode from optimizations=[pl.split(mode)] (only valid at + Level.CORE_GROUP) """ def __init__( @@ -840,16 +703,11 @@ def __init__( role: ir.Role | None = None, *, optimizations: list[Optimization] | None = None, - # Deprecated kwargs (kept for back-compat; emit DeprecationWarning at parse time): - optimization: _ChunkedLoopOptimizer | _ChunkedLoopOptimizerCall | None = None, - split: SplitMode | None = None, name_hint: str = "", ) -> None: self.level = level self.role = role self.optimizations = optimizations - self.optimization = optimization - self.split = split self.name_hint = name_hint def __enter__(self) -> None: @@ -864,62 +722,39 @@ def at( role: ir.Role | None = None, *, optimizations: list[Optimization] | None = None, - # Deprecated kwargs (kept for back-compat; emit DeprecationWarning at parse time): - optimization: _ChunkedLoopOptimizer | _ChunkedLoopOptimizerCall | None = None, - split: SplitMode | None = None, name_hint: str = "", ) -> AtContext: """Mark a region of code for execution at a specific hierarchy level. - With ``level=pl.Level.CORE_GROUP``, the ``optimizations=`` list controls - the resulting scope kind: - - - no entries → ``ScopeStmt(InCore)`` - - ``pl.split(mode)`` → ``ScopeStmt(InCore, split=mode)`` - - ``pl.auto_chunk`` → ``ScopeStmt(AutoInCore)`` - - both entries → ``ScopeStmt(AutoInCore, split=mode)`` - - For all other levels, this creates a Hierarchy scope. + At ``level=pl.Level.CORE_GROUP`` the optimizations list may contain + ``pl.split(mode)`` to request a cross-core data-transfer split mode on the + outlined InCore function. Args: level: Target hierarchy level (e.g. pl.Level.HOST, pl.Level.CORE_GROUP). role: Function role (Orchestrator or Worker). Default: None. - optimizations: Optional list literal of optimization entries. 
Each - entry must be one of ``pl.auto_chunk`` or ``pl.split(mode)`` — - written inline at the call site, since the DSL parser inspects - the AST and does not accept dynamically built variables here. - Entries are independent and may be combined. - optimization: **Deprecated.** Use ``optimizations=[pl.auto_chunk]`` (or - ``optimizations=[pl.auto_chunk, pl.split(mode)]``) instead. - split: **Deprecated.** Use ``optimizations=[pl.split(mode)]`` instead. + Not supported with level=CORE_GROUP. + optimizations: Optional list literal of optimization entries. Currently + only ``pl.split(mode)`` is supported. Must be written inline at the + call site — the DSL parser inspects the AST and does not accept + dynamically built variables here. name_hint: Optional name hint for the outlined function (must be a valid identifier). Returns: - Context manager for the appropriate scope. + Context manager for a HierarchyScopeStmt. Examples: - >>> # InCore scope (replaces pl.incore()): + >>> # CORE_GROUP scope (outlined into Function(InCore)): >>> with pl.at(level=pl.Level.CORE_GROUP): ... y = pl.ops.add(x, x) - >>> # InCore scope with split hint: + >>> # CORE_GROUP scope with AIC/AIV split hint: >>> with pl.at(level=pl.Level.CORE_GROUP, ... optimizations=[pl.split(pl.SplitMode.UP_DOWN)]): ... y = pl.ops.add(x, x) - >>> # AutoInCore scope (replaces pl.auto_incore()): - >>> with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk]): - ... for i in pl.parallel(0, 8, 1, chunk=4): - ... x = pl.add(x, x) - - >>> # AutoInCore + split hint (combined, independent entries): - >>> with pl.at(level=pl.Level.CORE_GROUP, - ... optimizations=[pl.auto_chunk, pl.split(pl.SplitMode.UP_DOWN)]): - ... for i in pl.parallel(0, 8, 1, chunk=4): - ... x = pl.add(x, x) - - >>> # Hierarchy scope (unchanged behavior): + >>> # Hierarchy scope with role (non-CORE_GROUP levels): >>> with pl.at(level=pl.Level.HOST, role=pl.Role.Worker): ... 
y = pl.add(x, x) """ @@ -927,8 +762,6 @@ def at( level, role, optimizations=optimizations, - optimization=optimization, - split=split, name_hint=name_hint, ) @@ -943,16 +776,11 @@ def at( "cond", "static_print", "static_assert", - "incore", - "auto_incore", "at", "cluster", "spmd", - "chunked_loop_optimizer", "RangeIterator", "WhileIterator", - "IncoreContext", - "AutoIncoreContext", "ClusterContext", "SpmdContext", "AtContext", diff --git a/python/pypto/language/optimizations.py b/python/pypto/language/optimizations.py index 3dea0abb7..f085d7426 100644 --- a/python/pypto/language/optimizations.py +++ b/python/pypto/language/optimizations.py @@ -10,22 +10,11 @@ """Optimization config entries for ``pl.at(..., optimizations=[...])``. Each entry is an orthogonal optimization hint applied to the enclosing scope. -The entries can be combined freely in the ``optimizations=`` list. Available entries: - ``pl.split(mode)`` — Cross-core data-transfer split hint, consumed by - the ``ExpandMixedKernel`` pass. Lowers the scope to ``InCore`` with - ``split_=mode``. - - ``pl.auto_chunk`` — Request compiler-driven outlining of chunked - parallel loops. Lowers the scope to ``AutoInCore`` so that the - ``InterchangeChunkLoops`` pass can interchange and outline chunked - loops within it. - -These two entries are independent and may be combined:: - - with pl.at(level=pl.Level.CORE_GROUP, - optimizations=[pl.auto_chunk, pl.split(pl.SplitMode.UP_DOWN)]): - ... + the ``ExpandMixedKernel`` pass. Only valid at ``Level::CORE_GROUP``; + sets ``split`` on the enclosing ``HierarchyScopeStmt``. """ from __future__ import annotations @@ -43,13 +32,9 @@ class Optimization: class Split(Optimization): """Cross-core data-transfer split hint. - Sets ``ScopeStmt::split_`` on the enclosing ``pl.at`` scope; that metadata - is consumed by the ``ExpandMixedKernel`` pass via the outlined function's - ``SplitMode``. 
The split hint is independent of the resulting scope kind: - - - ``optimizations=[pl.split(mode)]`` → ``ScopeKind::InCore`` (split metadata). - - ``optimizations=[pl.auto_chunk, pl.split(mode)]`` → ``ScopeKind::AutoInCore`` - (split metadata still attached). + Sets ``HierarchyScopeStmt::split_`` on the enclosing ``pl.at`` scope. + Only valid at ``Level::CORE_GROUP``; consumed by the ``ExpandMixedKernel`` + pass via the outlined function's ``SplitMode``. Args: mode: Split mode (``SplitMode.UP_DOWN`` or ``SplitMode.LEFT_RIGHT``). @@ -60,18 +45,6 @@ class Split(Optimization): mode: SplitMode -@dataclass(frozen=True) -class AutoChunk(Optimization): - """Request compiler-driven outlining of chunked parallel loops. - - Lowers the enclosing ``pl.at`` scope to ``ScopeKind::AutoInCore`` so the - ``InterchangeChunkLoops`` pass can interchange chunked parallel loops - and outline the inner sequential portion into ``InCore`` scopes. - - Only valid with ``level=pl.Level.CORE_GROUP``. - """ - - def split(mode: SplitMode) -> Split: """Create a ``Split`` optimization entry. @@ -93,17 +66,8 @@ def split(mode: SplitMode) -> Split: return Split(mode=mode) -auto_chunk: AutoChunk = AutoChunk() -"""Sentinel for the ``AutoChunk`` optimization. - -Use as ``pl.auto_chunk`` in ``pl.at(..., optimizations=[pl.auto_chunk, ...])``. 
-""" - - __all__ = [ "Optimization", "Split", - "AutoChunk", "split", - "auto_chunk", ] diff --git a/python/pypto/language/parser/ast_parser.py b/python/pypto/language/parser/ast_parser.py index b67df36e3..8a4fa6ba2 100644 --- a/python/pypto/language/parser/ast_parser.py +++ b/python/pypto/language/parser/ast_parser.py @@ -184,14 +184,9 @@ class _AtKwargState: level: "ir.Level | None" = None role: "ir.Role | None" = None name_hint: str = "" - requests_auto_chunk: bool = False split_mode: "ir.SplitMode | None" = None - # Tracks which kwarg produced the AutoChunk / split state so the validation - # step can reject mixing the new `optimizations=` list with the deprecated - # `optimization=`/`split=` kwargs and emit DeprecationWarning at the end. + # Tracks whether optimizations= was already consumed so we can reject duplicates. new_optimizations_kw: "ast.keyword | None" = field(default=None) - legacy_optimization_kw: "ast.keyword | None" = field(default=None) - legacy_split_kw: "ast.keyword | None" = field(default=None) class ASTParser: @@ -1302,14 +1297,6 @@ def parse_for_loop(self, stmt: ast.For) -> None: # noqa: PLR0912 def _validate_chunk_args(self, chunk_expr: Any, init_values: list[Any], iter_call: ast.Call) -> None: """Validate chunk arguments for range/parallel/unroll loops.""" - if not self._is_inside_scope(ir.ScopeKind.AutoInCore): - raise ParserSyntaxError( - "chunk=... 
loops are only valid inside " - "with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk]):", - span=self.span_tracker.get_span(iter_call), - hint="Wrap the loop in 'with pl.at(level=pl.Level.CORE_GROUP, " - "optimizations=[pl.auto_chunk]):' or remove the chunk= argument.", - ) if not _is_const_int(chunk_expr): raise ParserSyntaxError( "chunk must be a compile-time constant positive integer", @@ -1927,21 +1914,15 @@ def parse_if_statement(self, stmt: ast.If) -> None: self.in_if_stmt = False self.current_if_builder = None - def _parse_at_kwargs( - self, call: ast.Call - ) -> tuple[ir.Level, ir.Role | None, bool, ir.SplitMode | None, str]: - """Extract level, role, AutoChunk request, split mode, and name from pl.at(...) call. + def _parse_at_kwargs(self, call: ast.Call) -> tuple[ir.Level, ir.Role | None, ir.SplitMode | None, str]: + """Extract level, role, split mode, and name from pl.at(...) call. - Supports both positional and keyword forms. Preferred new API uses the - ``optimizations=[...]`` list with ``pl.split(...)`` and ``pl.auto_chunk`` - entries. The legacy ``optimization=`` and top-level ``split=`` kwargs - are still accepted but emit a DeprecationWarning. Mixing the new - ``optimizations=`` list with either deprecated kwarg is a hard error. + Supports both positional and keyword forms. The optimizations=[...] list + can contain ``pl.split(MODE)`` entries to request a cross-core split + (valid only at ``Level.CORE_GROUP``). Returns: - Tuple of (level, role, requests_auto_chunk, split_mode, name_hint). - ``requests_auto_chunk`` is True when the resulting scope must be - ``AutoInCore`` rather than ``InCore``. + Tuple of (level, role, split_mode, name_hint). 
""" if len(call.args) > 2: raise ParserSyntaxError( @@ -1964,8 +1945,7 @@ def _parse_at_kwargs( hint="Use pl.at(pl.Level.HOST) or pl.at(level=pl.Level.HOST)", ) - self._validate_at_kwarg_combinations(state) - return state.level, state.role, state.requests_auto_chunk, state.split_mode, state.name_hint + return state.level, state.role, state.split_mode, state.name_hint def _dispatch_at_keyword(self, kw: ast.keyword, state: "_AtKwargState") -> None: """Dispatch a single pl.at() keyword argument and update state.""" @@ -1978,11 +1958,13 @@ def _dispatch_at_keyword(self, kw: ast.keyword, state: "_AtKwargState") -> None: raise ParserSyntaxError("pl.at() got multiple values for argument 'role'") state.role = extract_enum_value(kw.value, ROLE_MAP, "Role", "pl.Role") elif kw.arg == "optimizations": - self._handle_at_optimizations_kw(kw, state) - elif kw.arg == "optimization": - self._handle_at_legacy_optimization_kw(kw, state) - elif kw.arg == "split": - self._handle_at_legacy_split_kw(kw, state) + if state.new_optimizations_kw is not None: + raise ParserSyntaxError( + "pl.at() got multiple values for argument 'optimizations'", + span=self.span_tracker.get_span(kw), + ) + state.new_optimizations_kw = kw + state.split_mode = self._parse_optimizations_list(kw.value) elif kw.arg == "name_hint": state.name_hint = self._parse_scope_name_hint(kw.value, "pl.at()") elif kw.arg is None: @@ -1996,113 +1978,28 @@ def _dispatch_at_keyword(self, kw: ast.keyword, state: "_AtKwargState") -> None: hint="Supported arguments: level, role, optimizations, name_hint", ) - def _handle_at_optimizations_kw(self, kw: ast.keyword, state: "_AtKwargState") -> None: - if state.new_optimizations_kw is not None: - raise ParserSyntaxError( - "pl.at() got multiple values for argument 'optimizations'", - span=self.span_tracker.get_span(kw), - ) - state.new_optimizations_kw = kw - state.requests_auto_chunk, state.split_mode = self._parse_optimizations_list(kw.value) - - def 
_handle_at_legacy_optimization_kw(self, kw: ast.keyword, state: "_AtKwargState") -> None: - if state.legacy_optimization_kw is not None: - raise ParserSyntaxError( - "pl.at() got multiple values for argument 'optimization'", - span=self.span_tracker.get_span(kw), - ) - state.legacy_optimization_kw = kw - # Bare or called legacy optimizer always implies AutoChunk. - state.requests_auto_chunk = True - state.split_mode = self._parse_chunked_loop_optimizer(kw.value) - - def _handle_at_legacy_split_kw(self, kw: ast.keyword, state: "_AtKwargState") -> None: - if state.legacy_split_kw is not None: - raise ParserSyntaxError( - "pl.at() got multiple values for argument 'split'", - span=self.span_tracker.get_span(kw), - ) - state.legacy_split_kw = kw - state.split_mode = self._eval_split_mode(kw.value) - - def _validate_at_kwarg_combinations(self, state: "_AtKwargState") -> None: - """Reject illegal kwarg combinations and emit DeprecationWarnings.""" - # Hard error when mixing new optimizations= with deprecated kwargs. - if state.new_optimizations_kw is not None and ( - state.legacy_optimization_kw is not None or state.legacy_split_kw is not None - ): - offending = state.legacy_optimization_kw or state.legacy_split_kw - assert offending is not None - raise ParserSyntaxError( - "Cannot mix 'optimizations=' with deprecated 'optimization=' or 'split=' kwargs in pl.at()", - span=self.span_tracker.get_span(offending), - hint="Use only optimizations=[pl.split(...), pl.auto_chunk] — drop the deprecated kwargs.", - ) - - # Preserve the pre-existing rule that the two deprecated kwargs cannot be - # combined: legacy `optimization=` always implied AutoInCore + a baked-in - # split, so combining it with legacy top-level `split=` was ambiguous. 
- if state.legacy_optimization_kw is not None and state.legacy_split_kw is not None: - raise ParserSyntaxError( - "Cannot use both 'optimization' and 'split' in pl.at()", - span=self.span_tracker.get_span(state.legacy_split_kw), - hint="Use optimizations=[pl.auto_chunk, pl.split(...)] for AutoInCore + " - "split, or optimizations=[pl.split(...)] for plain InCore + split.", - ) - - # Emit deprecation warnings for legacy kwargs (after mixing checks, so the - # user sees the structural error first if both apply). - if state.legacy_optimization_kw is not None: - warnings.warn( - "pl.at(optimization=pl.chunked_loop_optimizer[(...)]) is deprecated; " - "use pl.at(optimizations=[pl.auto_chunk]) — combine with pl.split(...) " - "if a split mode is needed.", - DeprecationWarning, - stacklevel=2, - ) - if state.legacy_split_kw is not None: - warnings.warn( - "pl.at(split=...) is deprecated; use pl.at(optimizations=[pl.split(...)]).", - DeprecationWarning, - stacklevel=2, - ) - - def _parse_optimizations_list(self, value: ast.expr) -> tuple[bool, "ir.SplitMode | None"]: + def _parse_optimizations_list(self, value: ast.expr) -> "ir.SplitMode | None": """Parse pl.at(..., optimizations=[...]) AST node. - Each entry must be one of: - - - ``pl.auto_chunk`` — request AutoInCore semantics. - - ``pl.split(MODE)`` — set the cross-core split mode. - - Both fully qualified forms (``pl.optimizations.auto_chunk``, - ``pl.optimizations.split(MODE)``) are also accepted. + Each entry must be ``pl.split(MODE)`` — the fully qualified form + ``pl.optimizations.split(MODE)`` is also accepted. Returns: - Tuple ``(requests_auto_chunk, split_mode)``. + The requested split mode, or ``None`` if no ``pl.split(...)`` entry + was provided. """ if not isinstance(value, ast.List): raise ParserSyntaxError( "pl.at(optimizations=...) 
must be a list literal", span=self.span_tracker.get_span(value), - hint="Use optimizations=[pl.split(pl.SplitMode.UP_DOWN)] or optimizations=[pl.auto_chunk].", + hint="Use optimizations=[pl.split(pl.SplitMode.UP_DOWN)].", ) - requests_auto_chunk = False split_mode: ir.SplitMode | None = None - seen_auto_chunk = False seen_split = False for entry in value.elts: - if self._is_pl_auto_chunk(entry): - if seen_auto_chunk: - raise ParserSyntaxError( - "Duplicate 'pl.auto_chunk' in optimizations=[...]", - span=self.span_tracker.get_span(entry), - ) - seen_auto_chunk = True - requests_auto_chunk = True - elif (mode := self._try_parse_pl_split(entry)) is not None: + if (mode := self._try_parse_pl_split(entry)) is not None: if seen_split: raise ParserSyntaxError( "Duplicate 'pl.split(...)' in optimizations=[...]", @@ -2114,28 +2011,10 @@ def _parse_optimizations_list(self, value: ast.expr) -> tuple[bool, "ir.SplitMod raise ParserSyntaxError( "Unsupported entry in pl.at(optimizations=[...])", span=self.span_tracker.get_span(entry), - hint="Each entry must be pl.auto_chunk or pl.split(pl.SplitMode.X).", + hint="Each entry must be pl.split(pl.SplitMode.X).", ) - return requests_auto_chunk, split_mode - - @staticmethod - def _is_pl_auto_chunk(node: ast.expr) -> bool: - """Return True if the AST node is ``pl.auto_chunk`` or ``pl.optimizations.auto_chunk``.""" - if not isinstance(node, ast.Attribute) or node.attr != "auto_chunk": - return False - # pl.auto_chunk - if isinstance(node.value, ast.Name) and node.value.id == "pl": - return True - # pl.optimizations.auto_chunk - if ( - isinstance(node.value, ast.Attribute) - and node.value.attr == "optimizations" - and isinstance(node.value.value, ast.Name) - and node.value.value.id == "pl" - ): - return True - return False + return split_mode def _try_parse_pl_split(self, node: ast.expr) -> "ir.SplitMode | None": """Return the SplitMode if the AST node is ``pl.split(MODE)``; else None. 
@@ -2183,65 +2062,6 @@ def _try_parse_pl_split(self, node: ast.expr) -> "ir.SplitMode | None": ) return mode - def _parse_chunked_loop_optimizer(self, value: ast.expr) -> "ir.SplitMode": - """Parse pl.chunked_loop_optimizer or pl.chunked_loop_optimizer(split=...) AST node. - - Returns the split mode to use for the AutoInCore scope. - """ - # Bare: pl.chunked_loop_optimizer - if ( - isinstance(value, ast.Attribute) - and value.attr == "chunked_loop_optimizer" - and isinstance(value.value, ast.Name) - and value.value.id == "pl" - ): - return ir.SplitMode.UP_DOWN - - # Called: pl.chunked_loop_optimizer(split=pl.SplitMode.UP_DOWN) - if ( - isinstance(value, ast.Call) - and isinstance(value.func, ast.Attribute) - and value.func.attr == "chunked_loop_optimizer" - and isinstance(value.func.value, ast.Name) - and value.func.value.id == "pl" - ): - if value.args: - raise ParserSyntaxError( - "pl.chunked_loop_optimizer() does not accept positional arguments", - span=self.span_tracker.get_span(value), - hint="Use: pl.chunked_loop_optimizer(split=pl.SplitMode.UP_DOWN)", - ) - split = ir.SplitMode.UP_DOWN - for opt_kw in value.keywords: - if opt_kw.arg == "split": - split = extract_enum_value(opt_kw.value, SPLIT_MODE_MAP, "SplitMode", "pl.SplitMode") - if split == ir.SplitMode.NONE: - raise ParserSyntaxError( - "pl.chunked_loop_optimizer() does not support split=pl.SplitMode.NONE", - span=self.span_tracker.get_span(opt_kw.value), - hint="Use pl.SplitMode.UP_DOWN or pl.SplitMode.LEFT_RIGHT", - ) - else: - raise ParserSyntaxError( - f"pl.chunked_loop_optimizer() got unexpected keyword '{opt_kw.arg}'", - span=self.span_tracker.get_span(opt_kw), - hint="Only 'split' is supported: " - "pl.chunked_loop_optimizer(split=pl.SplitMode.UP_DOWN)", - ) - return split - - raise ParserSyntaxError( - "optimization= only accepts pl.chunked_loop_optimizer or " - "pl.chunked_loop_optimizer(split=pl.SplitMode.UP_DOWN)", - span=self.span_tracker.get_span(value), - hint="Use 
optimization=pl.chunked_loop_optimizer or " - "optimization=pl.chunked_loop_optimizer(split=pl.SplitMode.UP_DOWN)", - ) - - def _eval_split_mode(self, value: ast.expr) -> "ir.SplitMode": - """Extract SplitMode enum value from AST expression.""" - return extract_enum_value(value, SPLIT_MODE_MAP, "SplitMode", "pl.SplitMode") - def _parse_scope_name_hint(self, value: ast.expr, func_name: str) -> str: """Extract and validate a scope name hint from an AST expression. @@ -2274,43 +2094,9 @@ def _parse_legacy_scope( func_attr: str, scope_kind_map: dict[str, "ir.ScopeKind"], ) -> None: - """Parse legacy scope context managers (pl.incore, pl.auto_incore, pl.cluster).""" - split_mode = None + """Parse legacy scope context managers (pl.cluster, pl.spmd).""" name_hint = "" - if func_attr in ("auto_incore", "incore"): - if context_expr.args: - raise ParserSyntaxError( - f"pl.{func_attr}() does not accept positional arguments", - span=self.span_tracker.get_span(stmt), - hint=f"Use 'with pl.{func_attr}(split=pl.SplitMode.UP_DOWN):'", - ) - for kw in context_expr.keywords: - if kw.arg == "split": - split_mode = self._eval_split_mode(kw.value) - elif kw.arg == "name_hint": - name_hint = self._parse_scope_name_hint(kw.value, f"pl.{func_attr}()") - else: - raise ParserSyntaxError( - f"pl.{func_attr}() got unexpected keyword argument '{kw.arg}'", - span=self.span_tracker.get_span(stmt), - hint="Supported keywords: 'split', 'name_hint'", - ) - if func_attr == "incore": - warnings.warn( - "pl.incore() is deprecated; use 'with pl.at(level=pl.Level.CORE_GROUP):' " - "(optionally with optimizations=[pl.split(pl.SplitMode.X)]) instead", - DeprecationWarning, - stacklevel=2, - ) - else: - warnings.warn( - "pl.auto_incore() is deprecated; use " - "'with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk]):' " - "(combine with pl.split(pl.SplitMode.X) if a split mode is needed) instead", - DeprecationWarning, - stacklevel=2, - ) - elif func_attr == "cluster": + if func_attr == 
"cluster": if context_expr.args: raise ParserSyntaxError( f"pl.{func_attr}() does not accept positional arguments", @@ -2330,18 +2116,14 @@ def _parse_legacy_scope( span = self.span_tracker.get_span(stmt) self._parse_scope_body(stmt, scope_kind, span, name_hint=name_hint) return - elif func_attr == "spmd": + if func_attr == "spmd": self._parse_spmd_scope(stmt, context_expr, scope_kind_map) return - elif context_expr.args or context_expr.keywords: - raise ParserSyntaxError( - f"pl.{func_attr}() does not accept arguments", - span=self.span_tracker.get_span(stmt), - hint=f"Use 'with pl.{func_attr}():' without arguments", - ) - scope_kind = scope_kind_map[func_attr] - span = self.span_tracker.get_span(stmt) - self._parse_scope_body(stmt, scope_kind, span, split=split_mode, name_hint=name_hint) + raise ParserSyntaxError( + f"Unsupported scope context manager 'pl.{func_attr}()'", + span=self.span_tracker.get_span(stmt), + hint="Supported: pl.cluster(), pl.spmd(...), pl.at(level=...)", + ) def _parse_spmd_scope( self, @@ -2465,26 +2247,15 @@ def _parse_scope_body( self.scope_manager.exit_scope(leak_vars=True) def _parse_at_scope(self, stmt: ast.With, context_expr: ast.Call) -> None: - """Parse pl.at(...) context manager into a ScopeStmt.""" - level, role, requests_auto_chunk, split_mode, name_hint = self._parse_at_kwargs(context_expr) + """Parse pl.at(...) 
context manager into a HierarchyScopeStmt.""" + level, role, split_mode, name_hint = self._parse_at_kwargs(context_expr) span = self.span_tracker.get_span(stmt) is_core_group = level == ir.Level.CORE_GROUP - if requests_auto_chunk and not is_core_group: - raise ParserSyntaxError( - "auto-chunk optimization is only supported with level=pl.Level.CORE_GROUP " - "(via optimizations=[pl.auto_chunk] or the deprecated " - "optimization=pl.chunked_loop_optimizer)", - span=span, - hint="Use pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk]) " - "for an AutoInCore scope.", - ) - if split_mode is not None and not is_core_group: raise ParserSyntaxError( - "split mode is only supported with level=pl.Level.CORE_GROUP " - "(via optimizations=[pl.split(...)] or the deprecated split= kwarg)", + "split mode is only supported with level=pl.Level.CORE_GROUP", span=span, hint="Use pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.split(pl.SplitMode.UP_DOWN)]).", ) @@ -2493,33 +2264,29 @@ def _parse_at_scope(self, stmt: ast.With, context_expr: ast.Call) -> None: raise ParserSyntaxError( "role= is not supported with level=pl.Level.CORE_GROUP", span=span, - hint="Drop role= for InCore/AutoInCore scopes, " - "or use a non-CORE_GROUP level for Hierarchy scope", + hint="Drop role= for Level.CORE_GROUP scopes, " + "or use a non-CORE_GROUP level for a Hierarchy scope with a role", ) - if not is_core_group: - self._parse_scope_body( - stmt, ir.ScopeKind.Hierarchy, span, level=level, role=role, name_hint=name_hint - ) - elif requests_auto_chunk: - self._parse_scope_body(stmt, ir.ScopeKind.AutoInCore, span, split=split_mode, name_hint=name_hint) - else: - self._parse_scope_body(stmt, ir.ScopeKind.InCore, span, split=split_mode, name_hint=name_hint) + self._parse_scope_body( + stmt, + ir.ScopeKind.Hierarchy, + span, + level=level, + role=role, + split=split_mode, + name_hint=name_hint, + ) def parse_with_statement(self, stmt: ast.With) -> None: """Parse with statement for scope 
contexts. Currently supports: - - with pl.incore(): ... (deprecated; creates ScopeStmt with InCore scope) - - with pl.incore(split=pl.SplitMode.UP_DOWN): ... (deprecated; InCore with split) - - with pl.auto_incore(): ... (deprecated; creates ScopeStmt with AutoInCore scope) - - with pl.auto_incore(split=pl.SplitMode.UP_DOWN): ... (deprecated; with split mode) - - with pl.cluster(): ... (creates ScopeStmt with Cluster scope) - - with pl.at(level=..., role=...): ... (creates ScopeStmt with InCore/Hierarchy scope) - - with pl.at(level=CORE_GROUP): ... (creates ScopeStmt with InCore scope) - - with pl.at(level=CORE_GROUP, split=pl.SplitMode.UP_DOWN): ... (InCore with split) - - with pl.at(level=CORE_GROUP, optimization=pl.chunked_loop_optimizer): ... - (creates ScopeStmt with AutoInCore scope) + - with pl.cluster(): ... (creates ClusterScopeStmt) + - with pl.spmd(core_num=N): ... (creates SpmdScopeStmt) + - with pl.at(level=...): ... (creates HierarchyScopeStmt) + - with pl.at(level=CORE_GROUP, optimizations=[pl.split(...)]): ... 
+ (creates HierarchyScopeStmt at CORE_GROUP with split mode) Args: stmt: With AST node @@ -2529,8 +2296,8 @@ def parse_with_statement(self, stmt: ast.With) -> None: raise ParserSyntaxError( "Only single context manager supported in with statement", span=self.span_tracker.get_span(stmt), - hint="Use 'with pl.incore():', 'with pl.auto_incore():'," - " 'with pl.cluster():', or 'with pl.at(level=...):'" + hint="Use 'with pl.cluster():', 'with pl.spmd(core_num=N):'," + " or 'with pl.at(level=...):'" " without multiple context managers", ) @@ -2539,8 +2306,6 @@ def parse_with_statement(self, stmt: ast.With) -> None: # Map DSL function names to ScopeKind values _SCOPE_KIND_MAP = { - "incore": ir.ScopeKind.InCore, - "auto_incore": ir.ScopeKind.AutoInCore, "cluster": ir.ScopeKind.Cluster, "spmd": ir.ScopeKind.Spmd, } @@ -2548,12 +2313,10 @@ def parse_with_statement(self, stmt: ast.With) -> None: if isinstance(context_expr, ast.Call): func = context_expr.func if isinstance(func, ast.Attribute) and isinstance(func.value, ast.Name) and func.value.id == "pl": - # Existing scope kinds: pl.incore(), pl.auto_incore(), pl.cluster() if func.attr in _SCOPE_KIND_MAP: self._parse_legacy_scope(stmt, context_expr, func.attr, _SCOPE_KIND_MAP) return - # pl.at(level=..., role=..., optimization=...) 
if func.attr == "at": self._parse_at_scope(stmt, context_expr) return @@ -2562,8 +2325,8 @@ def parse_with_statement(self, stmt: ast.With) -> None: raise UnsupportedFeatureError( "Unsupported context manager in with statement", span=self.span_tracker.get_span(stmt), - hint="Supported: 'with pl.incore():', 'with pl.auto_incore():'," - " 'with pl.cluster():', 'with pl.at(level=..., optimization=...):'", + hint="Supported: 'with pl.cluster():', 'with pl.spmd(core_num=N):'," + " 'with pl.at(level=..., optimizations=[...]):'", ) def parse_return(self, stmt: ast.Return) -> None: diff --git a/python/pypto/pypto_core/ir.pyi b/python/pypto/pypto_core/ir.pyi index a6c70b632..d75bd6e51 100644 --- a/python/pypto/pypto_core/ir.pyi +++ b/python/pypto/pypto_core/ir.pyi @@ -1771,19 +1771,13 @@ class WhileStmt(Stmt): class ScopeKind(enum.Enum): """Scope kind classification.""" - InCore = 0 - """InCore scope for AICore sub-graphs.""" - - AutoInCore = 1 - """AutoInCore scope for automatic chunking.""" - - Cluster = 2 + Cluster = 0 """Cluster scope for co-scheduled AIC + AIV groups.""" - Hierarchy = 3 - """Distributed hierarchy scope (uses level/role on ScopeStmt).""" + Hierarchy = 1 + """Distributed hierarchy scope (uses level/role/split on ScopeStmt).""" - Spmd = 4 + Spmd = 2 """SPMD dispatch scope (core_num/sync_start on ScopeStmt).""" class SplitMode(enum.Enum): @@ -1814,40 +1808,8 @@ class ScopeStmt(Stmt): """The nested statements.""" def __init__(self, *args: object, **kwargs: object) -> None: - """ScopeStmt is abstract — construct an InCoreScopeStmt, AutoInCoreScopeStmt, - ClusterScopeStmt, HierarchyScopeStmt, or SpmdScopeStmt instead.""" - -class InCoreScopeStmt(ScopeStmt): - """InCore scope: AICore sub-graph region.""" - - split: Final[SplitMode | None] - """Split mode for cross-core transfer (None or SplitMode.None for no split).""" - - def __init__( - self, - split: SplitMode | None = None, - name_hint: str = "", - *, - body: Stmt, - span: Span, - ) -> None: - """Create an 
InCore scope statement.""" - -class AutoInCoreScopeStmt(ScopeStmt): - """AutoInCore scope: InCore region with automatic chunking.""" - - split: Final[SplitMode | None] - """Split mode for cross-core transfer (None or SplitMode.None for no split).""" - - def __init__( - self, - split: SplitMode | None = None, - name_hint: str = "", - *, - body: Stmt, - span: Span, - ) -> None: - """Create an AutoInCore scope statement.""" + """ScopeStmt is abstract — construct a ClusterScopeStmt, HierarchyScopeStmt, + or SpmdScopeStmt instead.""" class ClusterScopeStmt(ScopeStmt): """Cluster scope: co-scheduled AIC + AIV group.""" @@ -1864,10 +1826,14 @@ class HierarchyScopeStmt(ScopeStmt): role: Final[Role | None] """Function role (Orchestrator or Worker; None for unspecified).""" + split: Final[SplitMode | None] + """AIC/AIV split mode (only valid at Level.CORE_GROUP).""" + def __init__( self, level: Level, role: Role | None = None, + split: SplitMode | None = None, name_hint: str = "", *, body: Stmt, @@ -2661,7 +2627,7 @@ class IRBuilder: """Begin building a scope statement. Args: - scope_kind: The kind of scope (e.g., ScopeKind.InCore) + scope_kind: The kind of scope (e.g., ScopeKind.Hierarchy) span: Source location for scope statement level: Hierarchy level (default: None) role: Hierarchy scope role (default: None) @@ -3191,8 +3157,6 @@ class IRVisitor: def visit_if_stmt(self, op: IfStmt) -> None: ... def visit_for_stmt(self, op: ForStmt) -> None: ... def visit_while_stmt(self, op: WhileStmt) -> None: ... - def visit_in_core_scope_stmt(self, op: InCoreScopeStmt) -> None: ... - def visit_auto_in_core_scope_stmt(self, op: AutoInCoreScopeStmt) -> None: ... def visit_cluster_scope_stmt(self, op: ClusterScopeStmt) -> None: ... def visit_hierarchy_scope_stmt(self, op: HierarchyScopeStmt) -> None: ... def visit_spmd_scope_stmt(self, op: SpmdScopeStmt) -> None: ... @@ -3268,8 +3232,6 @@ class IRMutator: def visit_if_stmt(self, op: IfStmt) -> Stmt: ... 
def visit_for_stmt(self, op: ForStmt) -> Stmt: ... def visit_while_stmt(self, op: WhileStmt) -> Stmt: ... - def visit_in_core_scope_stmt(self, op: InCoreScopeStmt) -> Stmt: ... - def visit_auto_in_core_scope_stmt(self, op: AutoInCoreScopeStmt) -> Stmt: ... def visit_cluster_scope_stmt(self, op: ClusterScopeStmt) -> Stmt: ... def visit_hierarchy_scope_stmt(self, op: HierarchyScopeStmt) -> Stmt: ... def visit_spmd_scope_stmt(self, op: SpmdScopeStmt) -> Stmt: ... diff --git a/python/pypto/pypto_core/passes.pyi b/python/pypto/pypto_core/passes.pyi index 2f5e6e93c..69500a5f9 100644 --- a/python/pypto/pypto_core/passes.pyi +++ b/python/pypto/pypto_core/passes.pyi @@ -23,7 +23,6 @@ class IRProperty(Enum): NoNestedCalls = ... NormalizedStmtStructure = ... NoRedundantBlocks = ... - SplitIncoreOrch = ... HasMemRefs = ... IncoreTileOps = ... AllocatedMemoryAddr = ... @@ -37,7 +36,6 @@ class IRProperty(Enum): StructuredCtrlFlow = ... VectorKernelSplit = ... OutParamNotShadowed = ... - NoNestedInCore = ... class IRPropertySet: """A set of IR properties backed by a bitset.""" @@ -316,12 +314,6 @@ class TypeCheckErrorType(Enum): FOR_RANGE_MUST_BE_SCALAR = ... CONDITION_MUST_BE_BOOL = ... 
-def split_chunked_loops() -> Pass: - """Create a pass that splits chunked loops into nested loops.""" - -def interchange_chunk_loops() -> Pass: - """Create a pass that interchanges chunk loops and inserts InCore scopes.""" - def unroll_loops() -> Pass: """Create a loop unrolling pass that expands ForKind.Unroll loops at compile time.""" @@ -348,14 +340,14 @@ def ctrl_flow_transform() -> Pass: def convert_to_ssa() -> Pass: """Create an SSA conversion pass.""" -def outline_incore_scopes() -> Pass: - """Create a pass that outlines InCore scopes.""" - def outline_cluster_scopes() -> Pass: """Create a pass that outlines Cluster scopes to Group and standalone Spmd scopes to Spmd.""" def outline_hierarchy_scopes() -> Pass: - """Create a pass that outlines Hierarchy scopes into level/role functions.""" + """Outline non-CORE_GROUP Hierarchy scopes into Opaque level/role functions.""" + +def outline_incore_scopes() -> Pass: + """Outline CORE_GROUP Hierarchy scopes into InCore functions; promote parent to Orchestration.""" def convert_tensor_to_tile_ops() -> Pass: """Create a pass that converts tensor ops to tile ops in InCore functions.""" @@ -492,14 +484,12 @@ __all__ = [ "VerificationError", "SSAErrorType", "TypeCheckErrorType", - "split_chunked_loops", - "interchange_chunk_loops", "unroll_loops", "ctrl_flow_transform", "convert_to_ssa", - "outline_incore_scopes", "outline_cluster_scopes", "outline_hierarchy_scopes", + "outline_incore_scopes", "convert_tensor_to_tile_ops", "optimize_orch_tensors", "flatten_tile_nd_to_2d", diff --git a/src/ir/builder.cpp b/src/ir/builder.cpp index da80e97d3..cc6a26e32 100644 --- a/src/ir/builder.cpp +++ b/src/ir/builder.cpp @@ -329,20 +329,13 @@ StmtPtr IRBuilder::EndScope(const Span& end_span) { // Dispatch on scope_kind to the matching derived class (issue #1047). 
ScopeStmtPtr scope_stmt; switch (scope_kind) { - case ScopeKind::InCore: - scope_stmt = std::make_shared(split, std::move(name_hint), body, combined_span); - break; - case ScopeKind::AutoInCore: - scope_stmt = - std::make_shared(split, std::move(name_hint), body, combined_span); - break; case ScopeKind::Cluster: scope_stmt = std::make_shared(std::move(name_hint), body, combined_span); break; case ScopeKind::Hierarchy: CHECK(level.has_value()) << "Hierarchy scope requires a level"; - scope_stmt = - std::make_shared(*level, role, std::move(name_hint), body, combined_span); + scope_stmt = std::make_shared(*level, role, split, std::move(name_hint), body, + combined_span); break; case ScopeKind::Spmd: CHECK(core_num.has_value()) << "Spmd scope requires core_num"; diff --git a/src/ir/serialization/serializer.cpp b/src/ir/serialization/serializer.cpp index 0584c40e9..62cfdc27e 100644 --- a/src/ir/serialization/serializer.cpp +++ b/src/ir/serialization/serializer.cpp @@ -219,8 +219,6 @@ class IRSerializer::Impl { SERIALIZE_FIELDS(ReturnStmt); SERIALIZE_FIELDS(ForStmt); SERIALIZE_FIELDS(WhileStmt); - SERIALIZE_FIELDS(InCoreScopeStmt); - SERIALIZE_FIELDS(AutoInCoreScopeStmt); SERIALIZE_FIELDS(ClusterScopeStmt); SERIALIZE_FIELDS(HierarchyScopeStmt); SERIALIZE_FIELDS(SpmdScopeStmt); diff --git a/src/ir/serialization/type_deserializers.cpp b/src/ir/serialization/type_deserializers.cpp index 00435d888..51f2c5233 100644 --- a/src/ir/serialization/type_deserializers.cpp +++ b/src/ir/serialization/type_deserializers.cpp @@ -592,28 +592,6 @@ static std::optional DeserializeScopeSplit(const msgpack::object& fie return split; } -// Deserialize InCoreScopeStmt -static IRNodePtr DeserializeInCoreScopeStmt(const msgpack::object& fields_obj, msgpack::zone& zone, - DeserializerContext& ctx) { - auto span = ctx.DeserializeSpan(GET_FIELD_OBJ("span")); - auto split = DeserializeScopeSplit(fields_obj, ctx); - auto name_hint = DeserializeScopeNameHint(fields_obj, ctx); - auto body = 
std::static_pointer_cast(ctx.DeserializeNode(GET_FIELD_OBJ("body"), zone)); - return std::make_shared(split, std::move(name_hint), body, span, - DeserializeLeadingComments(fields_obj)); -} - -// Deserialize AutoInCoreScopeStmt -static IRNodePtr DeserializeAutoInCoreScopeStmt(const msgpack::object& fields_obj, msgpack::zone& zone, - DeserializerContext& ctx) { - auto span = ctx.DeserializeSpan(GET_FIELD_OBJ("span")); - auto split = DeserializeScopeSplit(fields_obj, ctx); - auto name_hint = DeserializeScopeNameHint(fields_obj, ctx); - auto body = std::static_pointer_cast(ctx.DeserializeNode(GET_FIELD_OBJ("body"), zone)); - return std::make_shared(split, std::move(name_hint), body, span, - DeserializeLeadingComments(fields_obj)); -} - // Deserialize ClusterScopeStmt static IRNodePtr DeserializeClusterScopeStmt(const msgpack::object& fields_obj, msgpack::zone& zone, DeserializerContext& ctx) { @@ -641,9 +619,10 @@ static IRNodePtr DeserializeHierarchyScopeStmt(const msgpack::object& fields_obj role = static_cast(role_obj->via.u64); } + auto split = DeserializeScopeSplit(fields_obj, ctx); auto name_hint = DeserializeScopeNameHint(fields_obj, ctx); auto body = std::static_pointer_cast(ctx.DeserializeNode(GET_FIELD_OBJ("body"), zone)); - return std::make_shared(level, role, std::move(name_hint), body, span, + return std::make_shared(level, role, split, std::move(name_hint), body, span, DeserializeLeadingComments(fields_obj)); } @@ -689,10 +668,6 @@ static IRNodePtr DeserializeLegacyScopeStmt(const msgpack::object& fields_obj, m << "Legacy ScopeStmt scope_kind must be a string, got msgpack type " << static_cast(kind_obj.type); auto kind = StringToScopeKind(kind_obj.as()); switch (kind) { - case ScopeKind::InCore: - return DeserializeInCoreScopeStmt(fields_obj, zone, ctx); - case ScopeKind::AutoInCore: - return DeserializeAutoInCoreScopeStmt(fields_obj, zone, ctx); case ScopeKind::Cluster: return DeserializeClusterScopeStmt(fields_obj, zone, ctx); case ScopeKind::Hierarchy: 
@@ -937,9 +912,6 @@ static TypeRegistrar _yield_stmt_registrar("YieldStmt", DeserializeYieldStmt); static TypeRegistrar _return_stmt_registrar("ReturnStmt", DeserializeReturnStmt); static TypeRegistrar _for_stmt_registrar("ForStmt", DeserializeForStmt); static TypeRegistrar _while_stmt_registrar("WhileStmt", DeserializeWhileStmt); -static TypeRegistrar _in_core_scope_stmt_registrar("InCoreScopeStmt", DeserializeInCoreScopeStmt); -static TypeRegistrar _auto_in_core_scope_stmt_registrar("AutoInCoreScopeStmt", - DeserializeAutoInCoreScopeStmt); static TypeRegistrar _cluster_scope_stmt_registrar("ClusterScopeStmt", DeserializeClusterScopeStmt); static TypeRegistrar _hierarchy_scope_stmt_registrar("HierarchyScopeStmt", DeserializeHierarchyScopeStmt); static TypeRegistrar _spmd_scope_stmt_registrar("SpmdScopeStmt", DeserializeSpmdScopeStmt); diff --git a/src/ir/stmt.cpp b/src/ir/stmt.cpp index 31bac4898..55860d671 100644 --- a/src/ir/stmt.cpp +++ b/src/ir/stmt.cpp @@ -11,6 +11,25 @@ #include "pypto/ir/stmt.h" +#include +#include +#include + +#include "pypto/ir/function.h" + namespace pypto { -namespace ir {} // namespace ir +namespace ir { + +HierarchyScopeStmt::HierarchyScopeStmt(Level level, std::optional role, std::optional split, + std::string name_hint, StmtPtr body, Span span, + std::vector leading_comments) + : ScopeStmt(std::move(name_hint), std::move(body), std::move(span), std::move(leading_comments)), + level_(level), + role_(role), + split_(split) { + CHECK(!split_.has_value() || level_ == Level::CORE_GROUP) + << "HierarchyScopeStmt split is only valid at Level::CORE_GROUP"; +} + +} // namespace ir } // namespace pypto diff --git a/src/ir/transforms/convert_to_ssa_pass.cpp b/src/ir/transforms/convert_to_ssa_pass.cpp index 2bd1f7ff1..920691655 100644 --- a/src/ir/transforms/convert_to_ssa_pass.cpp +++ b/src/ir/transforms/convert_to_ssa_pass.cpp @@ -375,8 +375,7 @@ class SSAConverter { if (kind == ObjectKind::ReturnStmt) return ConvertReturn(As(s)); if (kind == 
ObjectKind::YieldStmt) return ConvertYield(As(s)); if (kind == ObjectKind::EvalStmt) return ConvertEval(As(s)); - if (kind == ObjectKind::InCoreScopeStmt || kind == ObjectKind::AutoInCoreScopeStmt || - kind == ObjectKind::ClusterScopeStmt || kind == ObjectKind::HierarchyScopeStmt || + if (kind == ObjectKind::ClusterScopeStmt || kind == ObjectKind::HierarchyScopeStmt || kind == ObjectKind::SpmdScopeStmt) { return ConvertScope(As(s)); } @@ -878,8 +877,6 @@ class SSAConverter { result->body_ = body; return result; }; - if (auto in_core = As(op)) return rewrite(in_core); - if (auto auto_in_core = As(op)) return rewrite(auto_in_core); if (auto cluster = As(op)) return rewrite(cluster); if (auto hier = As(op)) return rewrite(hier); if (auto spmd = As(op)) return rewrite(spmd); diff --git a/src/ir/transforms/flatten_call_expr_pass.cpp b/src/ir/transforms/flatten_call_expr_pass.cpp index e65f3cc6d..8e9eba128 100644 --- a/src/ir/transforms/flatten_call_expr_pass.cpp +++ b/src/ir/transforms/flatten_call_expr_pass.cpp @@ -57,8 +57,6 @@ class FlattenCallExprMutator : public IRMutator { StmtPtr VisitStmt_(const IfStmtPtr& op) override; StmtPtr VisitStmt_(const ForStmtPtr& op) override; StmtPtr VisitStmt_(const WhileStmtPtr& op) override; - StmtPtr VisitStmt_(const InCoreScopeStmtPtr& op) override; - StmtPtr VisitStmt_(const AutoInCoreScopeStmtPtr& op) override; StmtPtr VisitStmt_(const ClusterScopeStmtPtr& op) override; StmtPtr VisitStmt_(const HierarchyScopeStmtPtr& op) override; StmtPtr VisitStmt_(const SpmdScopeStmtPtr& op) override; @@ -317,19 +315,6 @@ StmtPtr FlattenScopeBody(FlattenCallExprMutator* self, std::vector& pen } } // namespace -StmtPtr FlattenCallExprMutator::VisitStmt_(const InCoreScopeStmtPtr& op) { - auto new_body = FlattenScopeBody(this, pending_stmts_, op->body_); - if (new_body.get() == op->body_.get()) return op; - return std::make_shared(op->split_, op->name_hint_, std::move(new_body), op->span_); -} - -StmtPtr FlattenCallExprMutator::VisitStmt_(const 
AutoInCoreScopeStmtPtr& op) { - auto new_body = FlattenScopeBody(this, pending_stmts_, op->body_); - if (new_body.get() == op->body_.get()) return op; - return std::make_shared(op->split_, op->name_hint_, std::move(new_body), - op->span_); -} - StmtPtr FlattenCallExprMutator::VisitStmt_(const ClusterScopeStmtPtr& op) { auto new_body = FlattenScopeBody(this, pending_stmts_, op->body_); if (new_body.get() == op->body_.get()) return op; @@ -339,7 +324,7 @@ StmtPtr FlattenCallExprMutator::VisitStmt_(const ClusterScopeStmtPtr& op) { StmtPtr FlattenCallExprMutator::VisitStmt_(const HierarchyScopeStmtPtr& op) { auto new_body = FlattenScopeBody(this, pending_stmts_, op->body_); if (new_body.get() == op->body_.get()) return op; - return std::make_shared(op->level_, op->role_, op->name_hint_, + return std::make_shared(op->level_, op->role_, op->split_, op->name_hint_, std::move(new_body), op->span_); } diff --git a/src/ir/transforms/flatten_tile_nd_to_2d_pass.cpp b/src/ir/transforms/flatten_tile_nd_to_2d_pass.cpp index 35862a990..6c0552309 100644 --- a/src/ir/transforms/flatten_tile_nd_to_2d_pass.cpp +++ b/src/ir/transforms/flatten_tile_nd_to_2d_pass.cpp @@ -279,11 +279,7 @@ std::vector TransformBody(const std::vector& stmts, FlattenCon new_scope->body_ = new_body; return new_scope; }; - if (auto in_core = As(stmt)) { - result.push_back(rewrite(in_core)); - } else if (auto auto_in_core = As(stmt)) { - result.push_back(rewrite(auto_in_core)); - } else if (auto cluster = As(stmt)) { + if (auto cluster = As(stmt)) { result.push_back(rewrite(cluster)); } else if (auto hier = As(stmt)) { result.push_back(rewrite(hier)); diff --git a/src/ir/transforms/fuse_create_assemble_to_slice_pass.cpp b/src/ir/transforms/fuse_create_assemble_to_slice_pass.cpp index d0b8b7a45..d4e511c8d 100644 --- a/src/ir/transforms/fuse_create_assemble_to_slice_pass.cpp +++ b/src/ir/transforms/fuse_create_assemble_to_slice_pass.cpp @@ -515,7 +515,7 @@ ProgramPtr TransformFuseCreateAssembleToSlice(const 
ProgramPtr& program) { return std::make_shared(std::move(new_functions), program->name_, program->span_); } -inline const PassProperties kFuseCreateAssembleToSliceProperties{.required = {IRProperty::SplitIncoreOrch}}; +inline const PassProperties kFuseCreateAssembleToSliceProperties{.required = {IRProperty::HierarchyOutlined}}; } // namespace diff --git a/src/ir/transforms/interchange_chunk_loops_pass.cpp b/src/ir/transforms/interchange_chunk_loops_pass.cpp deleted file mode 100644 index a84f269fb..000000000 --- a/src/ir/transforms/interchange_chunk_loops_pass.cpp +++ /dev/null @@ -1,887 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- * ----------------------------------------------------------------------------------------------------------- - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "pypto/core/error.h" -#include "pypto/core/logging.h" -#include "pypto/ir/core.h" -#include "pypto/ir/expr.h" -#include "pypto/ir/function.h" -#include "pypto/ir/program.h" -#include "pypto/ir/span.h" -#include "pypto/ir/stmt.h" -#include "pypto/ir/transforms/base/mutator.h" -#include "pypto/ir/transforms/base/visitor.h" -#include "pypto/ir/transforms/pass_properties.h" -#include "pypto/ir/transforms/passes.h" -#include "pypto/ir/transforms/utils/auto_name_utils.h" -#include "pypto/ir/transforms/utils/mutable_copy.h" -#include "pypto/ir/transforms/utils/transform_utils.h" -#include "pypto/ir/verifier/verifier.h" - -namespace pypto { -namespace ir { - -using Attrs = std::vector>; - -namespace { - -/// Build attrs for a generated loop: copy original attrs (excluding loop_origin) and set the new origin. -Attrs MakeLoopAttrs(const Attrs& original_attrs, LoopOrigin origin) { - Attrs result; - for (const auto& [key, value] : original_attrs) { - if (key != "loop_origin") result.emplace_back(key, value); - } - result.emplace_back("loop_origin", origin); - return result; -} - -/** - * @brief A single entry in a chunk-loop chain. - */ -struct ChainEntry { - ForStmtPtr for_stmt; - LoopOrigin origin; -}; - -/** - * @brief Check if a statement body contains a ScopeStmt(InCore). 
- */ -static bool ContainsInCoreScope(const StmtPtr& stmt) { - if (!stmt) return false; - - auto kind = stmt->GetKind(); - switch (kind) { - case ObjectKind::InCoreScopeStmt: - return true; - case ObjectKind::AutoInCoreScopeStmt: - case ObjectKind::ClusterScopeStmt: - case ObjectKind::HierarchyScopeStmt: - case ObjectKind::SpmdScopeStmt: { - auto scope = std::static_pointer_cast(stmt); - return ContainsInCoreScope(scope->body_); - } - case ObjectKind::SeqStmts: { - auto seq = std::static_pointer_cast(stmt); - for (const auto& s : seq->stmts_) { - if (ContainsInCoreScope(s)) return true; - } - return false; - } - case ObjectKind::ForStmt: { - auto for_stmt = std::static_pointer_cast(stmt); - return ContainsInCoreScope(for_stmt->body_); - } - default: - return false; - } -} - -static bool IsComputeTensorOp(const std::string& op_name) { - return transform_utils::IsComputeTensorOp(op_name); -} - -class ComputeTensorOpDetector : public IRVisitor { - public: - [[nodiscard]] bool Found() const { return found_; } - - void VisitExpr_(const CallPtr& op) override { - if (!op || found_) return; - if (op->op_ && IsComputeTensorOp(op->op_->name_)) { - found_ = true; - return; - } - IRVisitor::VisitExpr_(op); - } - - private: - bool found_ = false; -}; - -static bool ContainsComputeTensorOp(const StmtPtr& stmt) { - if (!stmt) return false; - ComputeTensorOpDetector detector; - detector.VisitStmt(stmt); - return detector.Found(); -} - -/// Detects whether an expression tree contains any sub-expression with TensorType or TileType. 
-class TensorOrTileTypedExprDetector : public IRVisitor { - public: - [[nodiscard]] bool Found() const { return found_; } - - void VisitExpr(const ExprPtr& expr) override { - if (!expr || found_) return; - auto type = expr->GetType(); - if (type) { - auto kind = type->GetKind(); - if (kind == ObjectKind::TensorType || kind == ObjectKind::TileType) { - found_ = true; - return; - } - } - IRVisitor::VisitExpr(expr); - } - - private: - bool found_ = false; -}; - -/// Returns true if stmt is an AssignStmt with a scalar-typed target variable -/// and a value expression that involves no tensor/tile data. -static bool IsPureScalarAssignment(const StmtPtr& stmt) { - if (!stmt) return false; - - auto kind = stmt->GetKind(); - if (kind == ObjectKind::AssignStmt) { - auto assign = std::static_pointer_cast(stmt); - auto var_type = assign->var_->GetType(); - if (!var_type || var_type->GetKind() != ObjectKind::ScalarType) return false; - TensorOrTileTypedExprDetector detector; - detector.VisitExpr(assign->value_); - return !detector.Found(); - } - - return false; -} - -static bool ContainsChunkLoop(const StmtPtr& stmt) { - if (!stmt) return false; - - auto kind = stmt->GetKind(); - switch (kind) { - case ObjectKind::ForStmt: { - auto for_stmt = std::static_pointer_cast(stmt); - return for_stmt->GetAttr("loop_origin") != LoopOrigin::Original || - ContainsChunkLoop(for_stmt->body_); - } - case ObjectKind::SeqStmts: { - auto seq = std::static_pointer_cast(stmt); - for (const auto& s : seq->stmts_) { - if (ContainsChunkLoop(s)) return true; - } - return false; - } - case ObjectKind::InCoreScopeStmt: - case ObjectKind::AutoInCoreScopeStmt: - case ObjectKind::ClusterScopeStmt: - case ObjectKind::HierarchyScopeStmt: - case ObjectKind::SpmdScopeStmt: { - auto scope = std::static_pointer_cast(stmt); - return ContainsChunkLoop(scope->body_); - } - default: - return false; - } -} - -/** - * @brief Check whether a statement needs an InCore wrapper after auto_incore is consumed. 
- * - * We only wrap statements that still need outlining: - * - compute tensor ops - * - chunk loops that failed interchange or remain sequential - * - * The following stay in orchestration (not wrapped): - * - Pure host-side groups (tensor.assemble/create/slice) - * - Pure scalar assignments (e.g., index arithmetic like `offset = ob * 32`) - * whose value expression contains no tensor/tile-typed sub-expressions - */ -static bool NeedsInCoreWrapping(const StmtPtr& stmt) { - if (!stmt) return false; - - auto kind = stmt->GetKind(); - if (kind == ObjectKind::YieldStmt || kind == ObjectKind::ReturnStmt) return false; - if (ContainsInCoreScope(stmt)) return false; - if (IsPureScalarAssignment(stmt)) return false; - - return ContainsChunkLoop(stmt) || ContainsComputeTensorOp(stmt); -} - -/** - * @brief Wrap statements that lack InCore coverage in ScopeStmt(InCore). - * - * After InterchangeChunkLoops processes the auto_incore body, some statements - * (standalone tensor ops, non-chunked loops, failed-interchange chains) may - * lack InCore wrapping. This function groups consecutive such statements and - * wraps each group in ScopeStmt(InCore). - * - * Control flow statements (YieldStmt, ReturnStmt) are never wrapped. - */ -static StmtPtr WrapNonIncoreStatementsInInCore(const StmtPtr& body, const Span& span, - std::optional split = std::nullopt) { - // When a ForStmt contains InCore scopes in its body (e.g. a pl.range loop - // wrapping interchanged parallel chunks), recurse into it so that non-InCore - // statements *inside* the loop body also get wrapped. 
- auto maybe_recurse_into_compound = [&](const StmtPtr& s) -> StmtPtr { - auto fs = std::dynamic_pointer_cast(s); - if (fs && ContainsInCoreScope(fs->body_)) { - auto new_body = WrapNonIncoreStatementsInInCore(fs->body_, span, split); - if (new_body.get() != fs->body_.get()) { - auto new_for = MutableCopy(fs); - new_for->body_ = new_body; - return new_for; - } - } - return s; - }; - - auto seq = std::dynamic_pointer_cast(body); - if (!seq) { - if (NeedsInCoreWrapping(body)) { - return std::make_shared(split, "", body, span); - } - return maybe_recurse_into_compound(body); - } - - // Check if any wrapping or recursion is needed (fast path) - bool has_work = false; - for (const auto& s : seq->stmts_) { - if (NeedsInCoreWrapping(s)) { - has_work = true; - break; - } - auto fs = std::dynamic_pointer_cast(s); - if (fs && ContainsInCoreScope(fs->body_)) { - has_work = true; - break; - } - } - if (!has_work) return body; - - // Group consecutive wrappable statements and wrap each group in InCore - std::vector result; - std::vector pending; - - auto flush = [&]() { - if (pending.empty()) return; - StmtPtr content = SeqStmts::Flatten(std::vector(pending), span); - result.push_back(std::make_shared(split, "", content, span)); - pending.clear(); - }; - - for (const auto& s : seq->stmts_) { - if (NeedsInCoreWrapping(s)) { - pending.push_back(s); - } else { - flush(); - result.push_back(maybe_recurse_into_compound(s)); - } - } - flush(); - - return SeqStmts::Flatten(std::move(result), span); -} - -/** - * @brief Mutator that interchanges ChunkOuter/ChunkInner loops and inserts InCore scopes. - * - * After SplitChunkedLoops produces nested ChunkOuter → ChunkInner pairs, - * this pass reorders them so all outers are on top, wraps inners + body - * in ScopeStmt(InCore). - * - * Only interchanges when ALL ChunkInner loops in the chain have ForKind::Parallel. 
- */ -class InterchangeChunkLoopsMutator : public IRMutator { - public: - ExprPtr VisitExpr_(const VarPtr& op) override { - auto it = substitution_map_.find(op.get()); - if (it != substitution_map_.end()) { - return it->second; - } - return op; - } - - ExprPtr VisitExpr_(const IterArgPtr& op) override { - auto it = substitution_map_.find(op.get()); - if (it != substitution_map_.end()) { - return it->second; - } - return IRMutator::VisitExpr_(op); - } - - StmtPtr VisitStmt_(const AutoInCoreScopeStmtPtr& op) override { - bool prev = inside_auto_incore_; - auto prev_split = current_split_; - inside_auto_incore_ = true; - current_split_ = op->split_; - auto new_body = VisitStmt(op->body_); - inside_auto_incore_ = prev; - current_split_ = prev_split; - // Consume the AutoInCore wrapper — return body directly. - // Wrap any statements that lack InCore coverage, propagating split. - new_body = WrapNonIncoreStatementsInInCore(new_body, op->span_, op->split_); - return new_body; - } - - StmtPtr VisitStmt_(const ForStmtPtr& op) override { - if (!inside_auto_incore_) { - return IRMutator::VisitStmt_(op); - } - - auto loop_origin = op->GetAttr("loop_origin"); - if (loop_origin == LoopOrigin::ChunkOuter) { - return HandleChunkOuter(op); - } - - if (loop_origin == LoopOrigin::ChunkRemainder) { - return HandleChunkRemainder(op); - } - - // Non-chunk loop: recurse normally - return IRMutator::VisitStmt_(op); - } - - StmtPtr VisitStmt_(const SeqStmtsPtr& op) override { - std::vector new_stmts; - bool changed = false; - - for (const auto& stmt : op->stmts_) { - auto new_stmt = VisitStmt(stmt); - if (new_stmt.get() != stmt.get()) { - changed = true; - } - // Flatten nested SeqStmts - auto seq = std::dynamic_pointer_cast(new_stmt); - if (seq) { - for (const auto& inner : seq->stmts_) { - new_stmts.push_back(inner); - } - } else { - new_stmts.push_back(new_stmt); - } - } - - if (!changed) { - return op; - } - return SeqStmts::Flatten(std::move(new_stmts), op->span_); - } - - private: - 
bool inside_auto_incore_ = false; - bool inside_incore_context_ = false; - std::optional current_split_; - std::unordered_map substitution_map_; - - /** - * @brief Visit a body that will be placed inside an InCore scope. - * - * Sets inside_incore_context_ so nested chains skip their own InCore wrapping. - * Returns whether a parent chain already provides InCore context (prev value). - */ - std::pair VisitBodyInIncoreContext(const StmtPtr& body) { - bool prev_incore = inside_incore_context_; - inside_incore_context_ = true; - auto result = VisitStmt(body); - inside_incore_context_ = prev_incore; - return {result, prev_incore}; - } - - /** - * @brief Collect a chain of chunk loops starting from a ChunkOuter. - * - * Walk into nested ForStmt bodies, collecting (ForStmt, LoopOrigin) entries. - * Stop at non-ForStmt or Original loop. - */ - static std::vector CollectChunkChain(const ForStmtPtr& start) { - std::vector chain; - chain.push_back({start, start->GetAttr("loop_origin")}); - - StmtPtr body = start->body_; - - // Walk through SeqStmts to find the actual ForStmt body - // (body can be SeqStmts with [for_loop, yield]) - while (true) { - ForStmtPtr next_for; - auto seq = std::dynamic_pointer_cast(body); - if (seq) { - // Verify body is exactly {ForStmt} or {ForStmt, YieldStmt} - // to ensure no side-effect statements are dropped during rebuild - size_t for_count = 0; - size_t yield_count = 0; - for (const auto& s : seq->stmts_) { - auto f = std::dynamic_pointer_cast(s); - if (f) { - next_for = f; - ++for_count; - } else if (s->GetKind() == ObjectKind::YieldStmt) { - ++yield_count; - } else { - // Non-loop, non-yield statement found — not safe to interchange - return chain; - } - } - if (for_count != 1 || yield_count > 1) { - return chain; - } - } else { - next_for = std::dynamic_pointer_cast(body); - } - - if (!next_for) break; - auto next_origin = next_for->GetAttr("loop_origin"); - if (next_origin == LoopOrigin::Original) break; - - chain.push_back({next_for, 
next_origin}); - body = next_for->body_; - } - - return chain; - } - - /** - * @brief Handle a ChunkOuter loop: collect chain, check guards, interchange if applicable. - */ - StmtPtr HandleChunkOuter(const ForStmtPtr& op) { - auto chain = CollectChunkChain(op); - - // Separate into outers and inners - std::vector outers; - std::vector inners; - for (const auto& entry : chain) { - if (entry.origin == LoopOrigin::ChunkOuter) { - outers.push_back(entry.for_stmt); - } else if (entry.origin == LoopOrigin::ChunkInner) { - inners.push_back(entry.for_stmt); - } - } - - // Guard: need at least 1 outer and 1 inner - if (outers.empty() || inners.empty()) { - return IRMutator::VisitStmt_(op); - } - - // Guard: all loops in the chain must have compatible iter_arg arity - const size_t ref_iter_args_size = chain.front().for_stmt->iter_args_.size(); - for (const auto& entry : chain) { - if (entry.for_stmt->iter_args_.size() != ref_iter_args_size) { - return IRMutator::VisitStmt_(op); - } - } - - // Guard: all ChunkInner loops must be Parallel - for (const auto& inner : inners) { - if (inner->kind_ != ForKind::Parallel) { - return IRMutator::VisitStmt_(op); - } - } - - // Guard: no existing InCore scope in innermost body - const auto& innermost = chain.back().for_stmt; - if (ContainsInCoreScope(innermost->body_)) { - return IRMutator::VisitStmt_(op); - } - - // Warn if this interchange is nested inside a parent chain's InCore context - if (inside_incore_context_) { - LOG_WARN << op->span_.filename_ << ":" << op->span_.begin_line_ << " — " - << "Nested chunked parallel loop found with intervening statements between it and its parent " - << "chunked parallel — the inner chunk will share the parent's InCore scope instead of " - << "getting its own. 
Consider removing the intervening statements or restructuring the loop " - << "nest so the chunked parallels are directly nested."; - } - - // Perform the interchange - return RebuildInterchanged(outers, inners, chain, op->span_); - } - - /** - * @brief Handle a ChunkRemainder loop: recurse into body and wrap sub-remainder loops in InCore. - * - * After recursion handles nested chunk chains (via HandleChunkOuter), scan the visited body - * for standalone parallel ChunkRemainder sub-loops and wrap each in InCore. - */ - StmtPtr HandleChunkRemainder(const ForStmtPtr& op) { - // Create new iter_args BEFORE visiting the body, and register old->new - // IterArg mappings in substitution_map_ so body references get rewritten. - std::vector new_iter_args; - bool iter_args_changed = false; - new_iter_args.reserve(op->iter_args_.size()); - for (const auto& ia : op->iter_args_) { - auto new_init = VisitExpr(ia->initValue_); - if (new_init.get() != ia->initValue_.get()) { - auto new_ia = std::make_shared(ia->name_hint_, ia->GetType(), new_init, ia->span_); - new_iter_args.push_back(new_ia); - // Register old -> new mapping so body references get rewritten - substitution_map_[ia.get()] = new_ia; - iter_args_changed = true; - } else { - new_iter_args.push_back(ia); - } - } - - // Recurse into the remainder body to handle nested chunk chains - auto new_body = VisitStmt(op->body_); - - // Wrap standalone parallel ChunkRemainder sub-loops in InCore - new_body = WrapSubRemainderLoopsInInCore(new_body, op->span_, current_split_); - - if (new_body.get() == op->body_.get() && !iter_args_changed) { - return op; - } - - auto new_for = MutableCopy(op); - new_for->iter_args_ = new_iter_args; - new_for->body_ = new_body; - return new_for; - } - - /** - * @brief Wrap standalone parallel ChunkRemainder ForStmts in InCore scopes. - * - * Scans top-level statements in body and wraps each ChunkRemainder loop that is - * Parallel and whose body doesn't already contain InCore. 
- */ - static StmtPtr WrapSubRemainderLoopsInInCore(const StmtPtr& body, const Span& span, - std::optional split = std::nullopt) { - auto should_wrap = [](const StmtPtr& s) -> bool { - auto fs = std::dynamic_pointer_cast(s); - return fs && fs->GetAttr("loop_origin") == LoopOrigin::ChunkRemainder && - fs->kind_ == ForKind::Parallel && !ContainsInCoreScope(fs->body_); - }; - - auto seq = std::dynamic_pointer_cast(body); - if (seq) { - std::vector new_stmts; - bool changed = false; - for (const auto& s : seq->stmts_) { - if (should_wrap(s)) { - new_stmts.push_back(std::make_shared(split, "", s, span)); - changed = true; - } else { - new_stmts.push_back(s); - } - } - if (!changed) return body; - return SeqStmts::Flatten(std::move(new_stmts), span); - } - - // Single statement - if (should_wrap(body)) { - return std::make_shared(split, "", body, span); - } - return body; - } - - /** - * @brief Rebuild the interchanged loop nest: outers on top, InCore { inners → body }. - * - * Original chain: O1 → I1 → O2 → I2 → body - * Result: O1 → O2 → InCore{ I1 → I2 → body } - * - * Iter_args are reconnected so the linear data flow is maintained: - * O1.init(original) → O2.init(from O1 iter_arg) → I1.init(from O2 iter_arg) - * → I2.init(from I1 iter_arg) → body - * Yields reverse the data flow back out. - */ - StmtPtr RebuildInterchanged(const std::vector& outers, const std::vector& inners, - const std::vector& chain, const Span& span) { - bool has_iter_args = !chain[0].for_stmt->iter_args_.empty(); - - if (!has_iter_args) { - return RebuildSimple(outers, inners, chain, span); - } - - return RebuildWithIterArgs(outers, inners, chain, span); - } - - /** - * @brief Simple rebuild without iter_args. 
- */ - StmtPtr RebuildSimple(const std::vector& outers, const std::vector& inners, - const std::vector& chain, const Span& span) { - // Get the body from the last loop in inners (not chain.back(), which may be a remainder) - const auto& innermost = inners.back(); - - auto [body, prev_incore] = VisitBodyInIncoreContext(innermost->body_); - - // Build inners inside-out - StmtPtr current = body; - for (int i = static_cast(inners.size()) - 1; i >= 0; --i) { - const auto& inner = inners[i]; - current = std::make_shared(inner->loop_var_, inner->start_, inner->stop_, inner->step_, - std::vector{}, current, std::vector{}, - inner->span_, inner->kind_, std::nullopt, - MakeLoopAttrs(inner->attrs_, LoopOrigin::ChunkInner)); - } - - // Wrap in InCore — skip if a parent chain already provides InCore context - if (!prev_incore) { - current = std::make_shared(current_split_, "", current, span); - } - - // Build outers inside-out, preserving the original ForKind. - for (int i = static_cast(outers.size()) - 1; i >= 0; --i) { - const auto& outer = outers[i]; - current = std::make_shared(outer->loop_var_, outer->start_, outer->stop_, outer->step_, - std::vector{}, current, std::vector{}, - outer->span_, outer->kind_, std::nullopt, - MakeLoopAttrs(outer->attrs_, LoopOrigin::ChunkOuter)); - } - - return current; - } - - /** - * @brief Rebuild with iter_args, reconnecting the SSA data flow. 
- * - * Original chain passes iter_args linearly through nested loops: - * O1.init(x_0) → I1.init(from O1_ia) → O2.init(from I1_ia) → I2.init(from O2_ia) → body - * - * After interchange: O1 → O2 → InCore{ I1 → I2 → body } - * New data flow: - * O1.init(x_0) → O2.init(from O1_ia) → I1.init(from O2_ia) → I2.init(from I1_ia) → body - */ - StmtPtr RebuildWithIterArgs(const std::vector& outers, const std::vector& inners, - const std::vector& chain, const Span& span) { - // Reorder the chain entries: outers first, then inners - std::vector reordered; - reordered.reserve(outers.size() + inners.size()); - for (const auto& o : outers) reordered.push_back(o); - for (const auto& i : inners) reordered.push_back(i); - - size_t num_iter_args = chain[0].for_stmt->iter_args_.size(); - size_t total_loops = reordered.size(); - - // Create fresh iter_args and return_vars for each loop in the reordered chain - std::vector> new_iter_args(total_loops); - std::vector> new_return_vars(total_loops); - - // The outermost loop gets the original init values from the first chain entry - const auto& first_orig = chain[0].for_stmt; - - for (size_t loop_idx = 0; loop_idx < total_loops; ++loop_idx) { - const auto& orig_loop = reordered[loop_idx]; - for (size_t ia_idx = 0; ia_idx < num_iter_args; ++ia_idx) { - const auto& orig_ia = first_orig->iter_args_[ia_idx]; - auto parsed_name = auto_name::Parse(orig_ia->name_hint_); - std::string loop_qualifier = auto_name::LoopLevelQualifier(static_cast(loop_idx)); - std::string combined_qualifier = - parsed_name.qualifier.empty() ? 
loop_qualifier : parsed_name.qualifier + "_" + loop_qualifier; - std::string ia_name = - auto_name::BuildName(parsed_name.base_name, combined_qualifier, "iter", parsed_name.version); - std::string rv_name = - auto_name::BuildName(parsed_name.base_name, combined_qualifier, "rv", parsed_name.version); - - ExprPtr init_value; - if (loop_idx == 0) { - // Outermost: use original init values (apply substitutions for nested chains) - init_value = VisitExpr(orig_ia->initValue_); - } else { - // Chain from previous loop's iter_arg - init_value = new_iter_args[loop_idx - 1][ia_idx]; - } - - auto new_ia = std::make_shared(ia_name, orig_ia->GetType(), init_value, orig_ia->span_); - auto new_rv = std::make_shared(rv_name, orig_ia->GetType(), orig_ia->span_); - - new_iter_args[loop_idx].push_back(new_ia); - new_return_vars[loop_idx].push_back(new_rv); - } - } - - // Now set up substitutions for the body: - // The last loop in reordered (last inner) passes its iter_args to the body. - // We remap its original iter_args to the new innermost iter_args. - // Note: chain.back() may be a ChunkRemainder that is NOT in reordered, - // so we must use reordered.back() to get the actual innermost interchange loop. 
- const auto& orig_innermost = reordered.back(); - size_t innermost_reordered_idx = total_loops - 1; - - for (size_t ia_idx = 0; ia_idx < num_iter_args; ++ia_idx) { - substitution_map_[orig_innermost->iter_args_[ia_idx].get()] = - new_iter_args[innermost_reordered_idx][ia_idx]; - } - - // Visit the innermost body with substitutions - auto [body, prev_incore] = VisitBodyInIncoreContext(orig_innermost->body_); - - // Build the loop nest inside-out, starting from the innermost (last in reordered) - StmtPtr current = body; - - for (int i = static_cast(total_loops) - 1; i >= 0; --i) { - const auto& orig_loop = reordered[i]; - bool is_inner = (orig_loop->GetAttr("loop_origin") == LoopOrigin::ChunkInner); - - // Build yield for this loop from the inner loop's return_vars - // (or body's yield values for the innermost) - if (!new_return_vars[i].empty()) { - std::vector yield_values; - if (i < static_cast(total_loops) - 1) { - // Yield the return vars of the next inner loop - for (const auto& rv : new_return_vars[i + 1]) { - yield_values.push_back(rv); - } - } else { - // Innermost: body already contains yield, current already has it - // Don't add extra yield - } - - if (!yield_values.empty()) { - auto yield_stmt = std::make_shared(yield_values, span); - current = SeqStmts::Flatten(std::vector{current, yield_stmt}, span); - } - } - - current = std::make_shared( - orig_loop->loop_var_, orig_loop->start_, orig_loop->stop_, orig_loop->step_, new_iter_args[i], - current, new_return_vars[i], orig_loop->span_, orig_loop->kind_, std::nullopt, - MakeLoopAttrs(orig_loop->attrs_, is_inner ? LoopOrigin::ChunkInner : LoopOrigin::ChunkOuter)); - - // Insert InCore scope right after building all inners (at the boundary). - // Skip if a parent chain already provides InCore context. - if (!prev_incore && !is_inner && i + 1 < static_cast(total_loops) && - reordered[i + 1]->GetAttr("loop_origin") == LoopOrigin::ChunkInner) { - // The current ForStmt body already contains the inner loops. 
- // We need to wrap the inner loop nest (current's body) in InCore. - // But current IS the outermost outer that contains inners already. - // Actually, we need to insert InCore between the last outer and first inner. - // Let's restructure: wrap the body of this outer in InCore. - auto outer_for = std::static_pointer_cast(current); - - // Extract the body (which is inners + yield) - auto incore_body = outer_for->body_; - // Separate the yield at the end from the body content - auto body_seq = std::dynamic_pointer_cast(incore_body); - if (body_seq && body_seq->stmts_.size() >= 2) { - // Last stmt should be yield, rest goes into InCore - std::vector incore_stmts; - incore_stmts.reserve(body_seq->stmts_.size() - 1); - for (size_t si = 0; si < body_seq->stmts_.size() - 1; ++si) { - incore_stmts.push_back(body_seq->stmts_[si]); - } - auto last_stmt = body_seq->stmts_.back(); - - StmtPtr incore_content; - if (incore_stmts.size() == 1) { - incore_content = incore_stmts[0]; - } else { - incore_content = SeqStmts::Flatten(std::move(incore_stmts), span); - } - - auto incore_scope = std::make_shared(current_split_, "", incore_content, span); - auto new_body = SeqStmts::Flatten(std::vector{incore_scope, last_stmt}, span); - - current = std::make_shared( - outer_for->loop_var_, outer_for->start_, outer_for->stop_, outer_for->step_, - outer_for->iter_args_, new_body, outer_for->return_vars_, outer_for->span_, outer_for->kind_, - std::nullopt, MakeLoopAttrs(outer_for->attrs_, LoopOrigin::ChunkOuter)); - } else { - // No yield, wrap entire body - auto incore_scope = std::make_shared(current_split_, "", incore_body, span); - current = std::make_shared( - outer_for->loop_var_, outer_for->start_, outer_for->stop_, outer_for->step_, - outer_for->iter_args_, incore_scope, outer_for->return_vars_, outer_for->span_, - outer_for->kind_, std::nullopt, MakeLoopAttrs(outer_for->attrs_, LoopOrigin::ChunkOuter)); - } - } - } - - // Remap original outer return_vars to new outermost 
return_vars - for (size_t ia_idx = 0; ia_idx < num_iter_args; ++ia_idx) { - substitution_map_[first_orig->return_vars_[ia_idx].get()] = new_return_vars[0][ia_idx]; - } - - return current; - } -}; - -/** - * @brief Transform a function by interchanging chunk loops and inserting InCore scopes. - */ -FunctionPtr TransformInterchangeChunkLoops(const FunctionPtr& func) { - INTERNAL_CHECK(func) << "InterchangeChunkLoops cannot run on null function"; - - InterchangeChunkLoopsMutator mutator; - auto new_body = mutator.VisitStmt(func->body_); - - if (new_body.get() == func->body_.get()) { - return func; - } - - auto new_func = MutableCopy(func); - new_func->body_ = new_body; - return new_func; -} - -} // namespace - -// Factory function -namespace pass { -Pass InterchangeChunkLoops() { - return CreateFunctionPass(TransformInterchangeChunkLoops, "InterchangeChunkLoops", - kInterchangeChunkLoopsProperties); -} -} // namespace pass - -// ============================================================================ -// NoNestedInCore structural property verifier -// ============================================================================ - -namespace { - -constexpr int kNestedIncoreCode = 501; - -/// Detects nested ScopeStmt(InCore) scopes in an IR tree. 
-class NestedInCoreScopeDetector : public IRVisitor { - public: - explicit NestedInCoreScopeDetector(std::vector& diagnostics) : diagnostics_(diagnostics) {} - - void VisitStmt_(const InCoreScopeStmtPtr& op) override { - if (!op) return; - if (inside_incore_) { - diagnostics_.emplace_back(DiagnosticSeverity::Error, "NoNestedInCore", kNestedIncoreCode, - "Nested InCore scope detected — InCore scopes must not contain other " - "InCore scopes", - op->span_); - } - bool prev = inside_incore_; - inside_incore_ = true; - IRVisitor::VisitStmt_(op); - inside_incore_ = prev; - } - - private: - std::vector& diagnostics_; - bool inside_incore_ = false; -}; - -} // namespace - -class NoNestedIncorePropertyVerifierImpl : public PropertyVerifier { - public: - [[nodiscard]] std::string GetName() const override { return "NoNestedInCore"; } - - void Verify(const ProgramPtr& program, std::vector& diagnostics) override { - if (!program) return; - for (const auto& [gv, func] : program->functions_) { - if (!func || !func->body_) continue; - NestedInCoreScopeDetector detector(diagnostics); - detector.VisitStmt(func->body_); - } - } -}; - -PropertyVerifierPtr CreateNoNestedIncorePropertyVerifier() { - return std::make_shared(); -} - -} // namespace ir -} // namespace pypto diff --git a/src/ir/transforms/ir_property.cpp b/src/ir/transforms/ir_property.cpp index 66cfc80e6..538928a35 100644 --- a/src/ir/transforms/ir_property.cpp +++ b/src/ir/transforms/ir_property.cpp @@ -33,8 +33,6 @@ std::string IRPropertyToString(IRProperty prop) { return "NormalizedStmtStructure"; case IRProperty::NoRedundantBlocks: return "NoRedundantBlocks"; - case IRProperty::SplitIncoreOrch: - return "SplitIncoreOrch"; case IRProperty::HasMemRefs: return "HasMemRefs"; case IRProperty::IncoreTileOps: @@ -61,8 +59,6 @@ std::string IRPropertyToString(IRProperty prop) { return "VectorKernelSplit"; case IRProperty::OutParamNotShadowed: return "OutParamNotShadowed"; - case IRProperty::NoNestedInCore: - return 
"NoNestedInCore"; case IRProperty::InOutUseValid: return "InOutUseValid"; default: @@ -132,8 +128,7 @@ VerificationLevel GetDefaultVerificationLevel() { const IRPropertySet& GetStructuralProperties() { static const IRPropertySet props{IRProperty::TypeChecked, IRProperty::BreakContinueValid, IRProperty::NoRedundantBlocks, IRProperty::UseAfterDef, - IRProperty::OutParamNotShadowed, IRProperty::NoNestedInCore, - IRProperty::InOutUseValid}; + IRProperty::OutParamNotShadowed, IRProperty::InOutUseValid}; return props; } @@ -144,8 +139,7 @@ const IRPropertySet& GetDefaultVerifyProperties() { IRProperty::BreakContinueValid, IRProperty::NoRedundantBlocks, IRProperty::UseAfterDef, - IRProperty::OutParamNotShadowed, - IRProperty::NoNestedInCore}; + IRProperty::OutParamNotShadowed}; return props; } diff --git a/src/ir/transforms/legalize_pto_buffer_reuse_pass.cpp b/src/ir/transforms/legalize_pto_buffer_reuse_pass.cpp index c46cdc99c..7e257df48 100644 --- a/src/ir/transforms/legalize_pto_buffer_reuse_pass.cpp +++ b/src/ir/transforms/legalize_pto_buffer_reuse_pass.cpp @@ -477,7 +477,7 @@ FunctionPtr TransformLegalizePTOBufferReuse(const FunctionPtr& func) { namespace pass { Pass LegalizePTOBufferReuse() { - static const PassProperties kProps{.required = {IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps, + static const PassProperties kProps{.required = {IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps, IRProperty::HasMemRefs, IRProperty::TileOps2D}}; return CreateFunctionPass(TransformLegalizePTOBufferReuse, "LegalizePTOBufferReuse", kProps); } diff --git a/src/ir/transforms/memory_reuse_pass.cpp b/src/ir/transforms/memory_reuse_pass.cpp index 1f863f9b2..74236254a 100644 --- a/src/ir/transforms/memory_reuse_pass.cpp +++ b/src/ir/transforms/memory_reuse_pass.cpp @@ -118,8 +118,6 @@ class LifetimeAnalyzer : public IRVisitor { } } - void VisitStmt_(const InCoreScopeStmtPtr& op) override { VisitStmt(op->body_); } - void VisitStmt_(const AutoInCoreScopeStmtPtr& op) 
override { VisitStmt(op->body_); } void VisitStmt_(const ClusterScopeStmtPtr& op) override { VisitStmt(op->body_); } void VisitStmt_(const HierarchyScopeStmtPtr& op) override { VisitStmt(op->body_); } void VisitStmt_(const SpmdScopeStmtPtr& op) override { VisitStmt(op->body_); } diff --git a/src/ir/transforms/mutator.cpp b/src/ir/transforms/mutator.cpp index b6faa550f..5ffab9c12 100644 --- a/src/ir/transforms/mutator.cpp +++ b/src/ir/transforms/mutator.cpp @@ -589,30 +589,6 @@ StmtPtr IRMutator::VisitStmt_(const WhileStmtPtr& op) { return op; } -StmtPtr IRMutator::VisitStmt_(const InCoreScopeStmtPtr& op) { - INTERNAL_CHECK_SPAN(op->body_, op->span_) << "InCoreScopeStmt has null body"; - auto new_body = StmtFunctor::VisitStmt(op->body_); - INTERNAL_CHECK_SPAN(new_body, op->span_) << "InCoreScopeStmt body mutated to null"; - if (new_body.get() != op->body_.get()) { - auto result = MutableCopy(op); - result->body_ = std::move(new_body); - return result; - } - return op; -} - -StmtPtr IRMutator::VisitStmt_(const AutoInCoreScopeStmtPtr& op) { - INTERNAL_CHECK_SPAN(op->body_, op->span_) << "AutoInCoreScopeStmt has null body"; - auto new_body = StmtFunctor::VisitStmt(op->body_); - INTERNAL_CHECK_SPAN(new_body, op->span_) << "AutoInCoreScopeStmt body mutated to null"; - if (new_body.get() != op->body_.get()) { - auto result = MutableCopy(op); - result->body_ = std::move(new_body); - return result; - } - return op; -} - StmtPtr IRMutator::VisitStmt_(const ClusterScopeStmtPtr& op) { INTERNAL_CHECK_SPAN(op->body_, op->span_) << "ClusterScopeStmt has null body"; auto new_body = StmtFunctor::VisitStmt(op->body_); diff --git a/src/ir/transforms/outline_hierarchy_scopes_pass.cpp b/src/ir/transforms/outline_hierarchy_scopes_pass.cpp index 4276224d0..c5b37b4d6 100644 --- a/src/ir/transforms/outline_hierarchy_scopes_pass.cpp +++ b/src/ir/transforms/outline_hierarchy_scopes_pass.cpp @@ -29,11 +29,14 @@ namespace ir { namespace pass { /** - * @brief Pass to outline Hierarchy scopes 
into separate functions with level/role + * @brief Pass to outline non-CORE_GROUP Hierarchy scopes into separate functions. * - * This pass transforms ScopeStmt(Hierarchy) nodes into separate Function definitions - * that carry the scope's Level and Role metadata, and replaces the scope with a Call - * to the outlined function. + * This pass transforms HierarchyScopeStmt nodes whose `level_` is anything other + * than `Level::CORE_GROUP` into separate Function definitions that carry the + * scope's Level/Role metadata, and replaces the scope with a Call to the outlined + * function. CORE_GROUP scopes are intentionally left intact for the subsequent + * `OutlineIncoreScopes` pass, which emits `Function(InCore)` and promotes the + * parent function from `Opaque` to `Orchestration`. * * Requirements: * - Input IR must be in SSA form (run ConvertToSSA first) @@ -41,14 +44,13 @@ namespace pass { * - Should run before OutlineIncoreScopes and OutlineClusterScopes * * Transformation: - * 1. For each ScopeStmt(Hierarchy) in an Opaque function: - * - Analyze body to determine external variable references (inputs) - * - Analyze subsequent statements to determine which definitions are outputs - * - Extract body into new Function(Opaque, level, role) with appropriate params/returns - * - Replace scope with Call to the outlined function + output assignments - * 2. Recursively handles nested Hierarchy scopes - * 3. Add outlined functions to the program - * 4. Parent function type is preserved (not promoted) + * 1. For each HierarchyScopeStmt at level != CORE_GROUP in an Opaque function: + * - Analyze body for inputs/outputs + * - Extract body into a new Opaque Function carrying the scope's level/role + * - Replace the scope with a Call to the outlined function + output assignments + * 2. Recursively descends into other scopes; nested non-CORE_GROUP Hierarchy + * scopes are outlined together with their parent. + * 3. CORE_GROUP scopes (and their bodies) are preserved verbatim. 
*/ Pass OutlineHierarchyScopes() { auto pass_func = [](const ProgramPtr& program) -> ProgramPtr { @@ -71,15 +73,20 @@ Pass OutlineHierarchyScopes() { } type_collector.VisitStmt(func->body_); - // Outline Hierarchy scopes in this function + // Outline non-CORE_GROUP Hierarchy scopes; CORE_GROUP scopes are skipped + // and handled by OutlineIncoreScopes downstream. + outline_utils::ScopeOutliner::HierarchyLevelFilter filter{ + Level::CORE_GROUP, outline_utils::ScopeOutliner::HierarchyLevelFilter::Mode::Exclude}; outline_utils::ScopeOutliner outliner(func->name_, type_collector.var_types, type_collector.var_objects, type_collector.known_names, ScopeKind::Hierarchy, - FunctionType::Opaque, "_hierarchy_"); + /*outlined_func_type=*/FunctionType::Opaque, "_hierarchy_", + /*program=*/nullptr, filter); auto new_body = outliner.VisitStmt(func->body_); - // Preserve parent function type (don't promote — hierarchy is orthogonal to FunctionType) auto new_func = MutableCopy(func); new_func->body_ = new_body; + // Parent type unchanged; CORE_GROUP-driven promotion to Orchestration + // happens in OutlineIncoreScopes. new_functions.push_back(new_func); const auto& outlined = outliner.GetOutlinedFunctions(); @@ -101,6 +108,10 @@ Pass OutlineHierarchyScopes() { // ============================================================================ // HierarchyOutlined property verifier // ============================================================================ +// +// This verifier is shared between OutlineHierarchyScopes and OutlineIncoreScopes. +// The HierarchyOutlined property is produced by OutlineIncoreScopes (which runs +// after OutlineHierarchyScopes), since CORE_GROUP scopes survive the first pass. 
namespace { @@ -116,9 +127,13 @@ class HierarchyOutlinedPropertyVerifierImpl : public PropertyVerifier { if (!program) return; for (const auto& [gv, func] : program->functions_) { if (!func || !func->body_) continue; - // Only check Opaque functions — the pass only processes Opaque functions, - // so Hierarchy scopes in other function types are not expected to be outlined. - if (func->func_type_ != FunctionType::Opaque) continue; + // After both outline passes have run, no Hierarchy scopes should remain in + // Opaque/Orchestration functions. Inside InCore/Group/Spmd outlined + // functions, Hierarchy scopes are disallowed by construction (the outliner + // only produces leaf scope bodies). + if (func->func_type_ != FunctionType::Opaque && func->func_type_ != FunctionType::Orchestration) { + continue; + } HierarchyOutlinedVerifier verifier(diagnostics, "HierarchyOutlined", "Hierarchy ScopeStmt found in function (should have been outlined)"); verifier.VisitStmt(func->body_); diff --git a/src/ir/transforms/outline_incore_scopes_pass.cpp b/src/ir/transforms/outline_incore_scopes_pass.cpp index 07028f048..ea7dfe956 100644 --- a/src/ir/transforms/outline_incore_scopes_pass.cpp +++ b/src/ir/transforms/outline_incore_scopes_pass.cpp @@ -10,21 +10,16 @@ */ #include -#include -#include #include -#include "pypto/core/error.h" -#include "pypto/ir/expr.h" #include "pypto/ir/function.h" #include "pypto/ir/program.h" #include "pypto/ir/stmt.h" +#include "pypto/ir/transforms/base/visitor.h" #include "pypto/ir/transforms/pass_properties.h" #include "pypto/ir/transforms/passes.h" #include "pypto/ir/transforms/utils/mutable_copy.h" #include "pypto/ir/transforms/utils/scope_outline_utils.h" -#include "pypto/ir/transforms/utils/transform_utils.h" -#include "pypto/ir/verifier/verifier.h" namespace pypto { namespace ir { @@ -32,38 +27,64 @@ namespace ir { namespace pass { /** - * @brief Pass to outline InCore scopes into separate functions + * @brief Pass to outline CORE_GROUP Hierarchy 
scopes into InCore functions. * - * This pass transforms ScopeStmt(InCore) nodes into separate Function(InCore) definitions - * and replaces the scope with a Call to the outlined function. + * This pass picks up where OutlineHierarchyScopes leaves off: it transforms + * every `HierarchyScopeStmt(level=CORE_GROUP)` that survived the previous pass + * into a separate `Function(InCore)` definition and replaces the scope with a + * `Call` to that function. When any CORE_GROUP scope is outlined out of an + * `Opaque` function, the parent function is promoted from `Opaque` to + * `Orchestration` so downstream tile-level passes see the canonical + * Orchestration → InCore call shape. * * Requirements: * - Input IR must be in SSA form (run ConvertToSSA first) - * - Only processes Opaque functions (InCore functions are left unchanged) + * - Should run after OutlineHierarchyScopes and before OutlineClusterScopes + * - Only processes Opaque functions * - * Transformation: - * 1. For each ScopeStmt(InCore) in an Opaque function: - * - Analyze body to determine external variable references (inputs) - * - Analyze subsequent statements to determine which definitions are outputs - * - Extract body into new Function(InCore) with appropriate params/returns - * - Replace scope with Call to the outlined function + output assignments - * - EvalStmt(store) calls on output tensors are converted to AssignStmt - * 2. Recursively handles nested InCore scopes - * 3. Add outlined functions to the program - * 4. Promote the parent function from Opaque to Orchestration + * Together with OutlineHierarchyScopes this pass establishes the + * `HierarchyOutlined` property: after both have run, no `HierarchyScopeStmt` + * remains in any Opaque/Orchestration function body. */ +namespace { + +/// Returns true iff any HierarchyScopeStmt at Level::CORE_GROUP appears under +/// the given statement. Used to decide whether to promote the parent function +/// from Opaque to Orchestration. 
+class CoreGroupHierarchyFinder : public IRVisitor { + public: + bool found = false; + + protected: + void VisitStmt_(const HierarchyScopeStmtPtr& op) override { + if (op->level_ == Level::CORE_GROUP) { + found = true; + } + IRVisitor::VisitStmt_(op); + } +}; + +} // namespace + Pass OutlineIncoreScopes() { auto pass_func = [](const ProgramPtr& program) -> ProgramPtr { std::vector new_functions; std::vector all_outlined_functions; for (const auto& [gvar, func] : program->functions_) { - // Only process Opaque functions (InCore functions are already outlined) + // Only Opaque functions can carry CORE_GROUP HierarchyScopeStmts at this + // point in the pipeline. if (func->func_type_ != FunctionType::Opaque) { new_functions.push_back(func); continue; } + // Detect CORE_GROUP scopes before outlining; outliner.GetOutlinedFunctions() + // tells us *what* was outlined, but we need the parent-promotion decision + // up front so it is symmetric with future filters. + CoreGroupHierarchyFinder finder; + finder.VisitStmt(func->body_); + // Build symbol table for this function outline_utils::VarCollector type_collector; for (const auto& var : func->params_) { @@ -73,29 +94,31 @@ Pass OutlineIncoreScopes() { } type_collector.VisitStmt(func->body_); - // Outline InCore scopes in this function + // Outline only HierarchyScopeStmts at CORE_GROUP into InCore functions. + outline_utils::ScopeOutliner::HierarchyLevelFilter filter{ + Level::CORE_GROUP, outline_utils::ScopeOutliner::HierarchyLevelFilter::Mode::Only}; outline_utils::ScopeOutliner outliner(func->name_, type_collector.var_types, type_collector.var_objects, - type_collector.known_names, ScopeKind::InCore, - FunctionType::InCore, "_incore_"); + type_collector.known_names, ScopeKind::Hierarchy, + /*outlined_func_type=*/FunctionType::InCore, "_incore_", + /*program=*/nullptr, filter); auto new_body = outliner.VisitStmt(func->body_); - // Create new function with transformed body. 
- // If any InCore scopes were outlined, promote Opaque -> Orchestration. - const auto& outlined = outliner.GetOutlinedFunctions(); - FunctionType new_func_type = outlined.empty() ? func->func_type_ : FunctionType::Orchestration; auto new_func = MutableCopy(func); new_func->body_ = new_body; - new_func->func_type_ = new_func_type; + if (finder.found) { + // Promote parent Opaque → Orchestration whenever any CORE_GROUP scope + // was outlined, matching the contract the former OutlineIncoreScopes + // (driven by InCoreScopeStmt) used to satisfy. + new_func->func_type_ = FunctionType::Orchestration; + } new_functions.push_back(new_func); - // Collect outlined functions (prepend before parent so inner functions come first) + const auto& outlined = outliner.GetOutlinedFunctions(); all_outlined_functions.insert(all_outlined_functions.end(), outlined.begin(), outlined.end()); } - // Add all outlined functions before the originals + // Outlined functions go before the originals so call sites can reference them. all_outlined_functions.insert(all_outlined_functions.end(), new_functions.begin(), new_functions.end()); - - // Create new program with all functions return std::make_shared(all_outlined_functions, program->name_, program->span_); }; @@ -103,69 +126,5 @@ Pass OutlineIncoreScopes() { } } // namespace pass - -// ============================================================================ -// SplitIncoreOrch property verifier -// ============================================================================ - -namespace { - -/** - * @brief Checks no InCore ScopeStmts remain in Opaque or Orchestration functions. - */ -using SplitIncoreOrchVerifier = outline_utils::ScopeKindAbsenceVerifier; - -static bool IsComputeTensorOp(const std::string& op_name) { - return transform_utils::IsComputeTensorOp(op_name); -} - -/// Checks Orchestration functions for compute tensor ops that should be in InCore. 
-class OrchComputeTensorOpVerifier : public IRVisitor { - public: - explicit OrchComputeTensorOpVerifier(std::vector& diagnostics) : diagnostics_(diagnostics) {} - - void VisitExpr_(const CallPtr& op) override { - if (op && op->op_ && IsComputeTensorOp(op->op_->name_)) { - diagnostics_.emplace_back(DiagnosticSeverity::Warning, "SplitIncoreOrch", 0, - "Compute tensor op '" + op->op_->name_ + - "' found in Orchestration function (should be inside InCore)", - op->span_); - } - IRVisitor::VisitExpr_(op); - } - - private: - std::vector& diagnostics_; -}; - -} // namespace - -class SplitIncoreOrchPropertyVerifierImpl : public PropertyVerifier { - public: - [[nodiscard]] std::string GetName() const override { return "SplitIncoreOrch"; } - - void Verify(const ProgramPtr& program, std::vector& diagnostics) override { - if (!program) return; - for (const auto& [gv, func] : program->functions_) { - if (!func || !func->body_) continue; - // Check Opaque and Orchestration functions — InCore functions are expected to have InCore content - if (func->func_type_ == FunctionType::InCore) continue; - SplitIncoreOrchVerifier verifier( - diagnostics, "SplitIncoreOrch", - "InCore ScopeStmt found in non-InCore function (should have been outlined)"); - verifier.VisitStmt(func->body_); - // Also check Orchestration functions for leaked compute tensor ops - if (func->func_type_ == FunctionType::Orchestration) { - OrchComputeTensorOpVerifier compute_verifier(diagnostics); - compute_verifier.VisitStmt(func->body_); - } - } - } -}; - -PropertyVerifierPtr CreateSplitIncoreOrchPropertyVerifier() { - return std::make_shared(); -} - } // namespace ir } // namespace pypto diff --git a/src/ir/transforms/python_printer.cpp b/src/ir/transforms/python_printer.cpp index 47505cc30..af577cd14 100644 --- a/src/ir/transforms/python_printer.cpp +++ b/src/ir/transforms/python_printer.cpp @@ -239,8 +239,6 @@ class IRPythonPrinter : public IRVisitor { void VisitStmt_(const ReturnStmtPtr& op) override; void 
VisitStmt_(const ForStmtPtr& op) override; void VisitStmt_(const WhileStmtPtr& op) override; - void VisitStmt_(const InCoreScopeStmtPtr& op) override; - void VisitStmt_(const AutoInCoreScopeStmtPtr& op) override; void VisitStmt_(const ClusterScopeStmtPtr& op) override; void VisitStmt_(const HierarchyScopeStmtPtr& op) override; void VisitStmt_(const SpmdScopeStmtPtr& op) override; @@ -912,9 +910,9 @@ void IRPythonPrinter::VisitStmt_(const ForStmtPtr& op) { VisitExpr(op->step_); } - // Unroll loops cannot have iter_args. The DSL parser forbids init_values for - // pl.unroll(), and SplitChunkedLoops preserves this: chunk-split unroll loops - // always take the simple (no iter_args) path. + // Unroll loops cannot have iter_args. The DSL parser forbids init_values + // for pl.unroll(), and no built-in pass produces an unroll loop with + // iter_args; printers that hit this branch indicate a malformed IR. if (op->kind_ == ForKind::Unroll && !op->iter_args_.empty()) { INTERNAL_CHECK_SPAN(false, op->span_) << "ForKind::Unroll does not support iter_args/init_values"; } @@ -1029,41 +1027,14 @@ void IRPythonPrinter::VisitStmt_(const WhileStmtPtr& op) { } void IRPythonPrinter::VisitStmt_(const HierarchyScopeStmtPtr& op) { - // Print as: with pl.at(level=pl.Level.X, role=pl.Role.Y, [name_hint="..."]): + // Print as: with pl.at(level=pl.Level.X, [role=...], [optimizations=[pl.split(...)]], [name_hint=...]): stream_ << "with " << prefix_ << ".at(level=" << prefix_ << ".Level." << LevelToString(op->level_); if (op->role_.has_value()) { stream_ << ", role=" << prefix_ << ".Role." 
<< RoleToString(*op->role_); } - if (!op->name_hint_.empty()) { - stream_ << ", name_hint=\"" << op->name_hint_ << "\""; - } - stream_ << "):\n"; - IncreaseIndent(); - PrintStmtBlock(op->body_); - DecreaseIndent(); -} - -void IRPythonPrinter::VisitStmt_(const InCoreScopeStmtPtr& op) { - stream_ << "with " << prefix_ << ".at(level=" << prefix_ << ".Level.CORE_GROUP"; if (op->split_.has_value() && op->split_.value() != SplitMode::None) { - stream_ << ", split=" << prefix_ << ".SplitMode." << SplitModeToPythonString(op->split_.value()); - } - if (!op->name_hint_.empty()) { - stream_ << ", name_hint=\"" << op->name_hint_ << "\""; - } - stream_ << "):\n"; - IncreaseIndent(); - PrintStmtBlock(op->body_); - DecreaseIndent(); -} - -void IRPythonPrinter::VisitStmt_(const AutoInCoreScopeStmtPtr& op) { - stream_ << "with " << prefix_ << ".at(level=" << prefix_ << ".Level.CORE_GROUP, optimization="; - if (op->split_.has_value() && op->split_.value() != SplitMode::None) { - stream_ << prefix_ << ".chunked_loop_optimizer(split=" << prefix_ << ".SplitMode." - << SplitModeToPythonString(op->split_.value()) << ")"; - } else { - stream_ << prefix_ << ".chunked_loop_optimizer"; + stream_ << ", optimizations=[" << prefix_ << ".split(" << prefix_ << ".SplitMode." + << SplitModeToPythonString(op->split_.value()) << ")]"; } if (!op->name_hint_.empty()) { stream_ << ", name_hint=\"" << op->name_hint_ << "\""; diff --git a/src/ir/transforms/split_chunked_loops_pass.cpp b/src/ir/transforms/split_chunked_loops_pass.cpp deleted file mode 100644 index 074338f34..000000000 --- a/src/ir/transforms/split_chunked_loops_pass.cpp +++ /dev/null @@ -1,832 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. 
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "pypto/core/dtype.h" -#include "pypto/core/error.h" -#include "pypto/core/logging.h" -#include "pypto/ir/core.h" -#include "pypto/ir/expr.h" -#include "pypto/ir/function.h" -#include "pypto/ir/scalar_expr.h" -#include "pypto/ir/span.h" -#include "pypto/ir/stmt.h" -#include "pypto/ir/transforms/base/mutator.h" -#include "pypto/ir/transforms/pass_properties.h" -#include "pypto/ir/transforms/passes.h" -#include "pypto/ir/transforms/utils/auto_name_utils.h" -#include "pypto/ir/transforms/utils/mutable_copy.h" -#include "pypto/ir/transforms/utils/transform_utils.h" -#include "pypto/ir/type.h" - -namespace pypto { -namespace ir { - -using Attrs = std::vector>; -using transform_utils::CollectDefVars; - -namespace { - -/// Build attrs for a generated loop: copy original attrs (excluding loop_origin) and set the new origin. -Attrs MakeLoopAttrs(const Attrs& original_attrs, LoopOrigin origin) { - Attrs result; - for (const auto& [key, value] : original_attrs) { - if (key != "loop_origin") result.emplace_back(key, value); - } - result.emplace_back("loop_origin", origin); - return result; -} - -/** - * @brief Try to extract a compile-time integer from a ConstInt or Neg(ConstInt). - * @return The integer value, or std::nullopt if not a compile-time constant. 
- */ -static std::optional TryGetConstInt(const ExprPtr& expr) { - auto ci = std::dynamic_pointer_cast(expr); - if (ci) { - return ci->value_; - } - auto neg = std::dynamic_pointer_cast(expr); - if (neg) { - auto inner = std::dynamic_pointer_cast(neg->operand_); - if (inner) { - return -inner->value_; - } - } - return std::nullopt; -} - -/** - * @brief Extract a compile-time integer value from a ConstInt or Neg(ConstInt) expression. - */ -static int64_t GetConstIntValue(const ExprPtr& expr, const std::string& what) { - auto val = TryGetConstInt(expr); - if (val.has_value()) { - return *val; - } - throw pypto::ValueError("Chunked loop " + what + " must be a compile-time integer constant, got " + - expr->TypeName()); -} - -/** - * @brief Create a ConstInt expression with INDEX dtype. - */ -static ExprPtr MakeConstIndex(int64_t value, const Span& span) { - return std::make_shared(value, DataType::INDEX, span); -} - -/** - * @brief Compute trip count from compile-time constant bounds. - */ -static int64_t ComputeStaticTripCount(int64_t start, int64_t stop, int64_t step) { - if (step > 0 && start < stop) { - return (stop - start + step - 1) / step; - } - if (step < 0 && start > stop) { - return (start - stop + (-step) - 1) / (-step); - } - return 0; -} - -/** - * @brief Build trip count as an expression tree for dynamic bounds. 
- * - * Produces: max(ceildiv(stop - start, step), 0) when step > 0 - * max(ceildiv(start - stop, -step), 0) when step < 0 - */ -static ExprPtr BuildTripCountExpr(const ExprPtr& start, const ExprPtr& stop, int64_t step, const Span& sp) { - ExprPtr trip_count; - if (step > 0) { - ExprPtr range_size = MakeSub(stop, start, sp); - if (step == 1) { - trip_count = range_size; - } else { - trip_count = - MakeFloorDiv(MakeAdd(range_size, MakeConstIndex(step - 1, sp), sp), MakeConstIndex(step, sp), sp); - } - } else { - ExprPtr range_size = MakeSub(start, stop, sp); - int64_t abs_step = -step; - if (abs_step == 1) { - trip_count = range_size; - } else { - trip_count = MakeFloorDiv(MakeAdd(range_size, MakeConstIndex(abs_step - 1, sp), sp), - MakeConstIndex(abs_step, sp), sp); - } - } - return MakeMax(trip_count, MakeConstIndex(0, sp), sp); -} - -static void CollectDeclaredNames(const StmtPtr& stmt, std::unordered_set& result) { - if (!stmt) return; - - auto kind = stmt->GetKind(); - switch (kind) { - case ObjectKind::AssignStmt: { - auto assign = std::static_pointer_cast(stmt); - result.insert(assign->var_->name_hint_); - break; - } - case ObjectKind::ForStmt: { - auto for_stmt = std::static_pointer_cast(stmt); - result.insert(for_stmt->loop_var_->name_hint_); - for (const auto& ia : for_stmt->iter_args_) result.insert(ia->name_hint_); - for (const auto& rv : for_stmt->return_vars_) result.insert(rv->name_hint_); - CollectDeclaredNames(for_stmt->body_, result); - break; - } - case ObjectKind::WhileStmt: { - auto while_stmt = std::static_pointer_cast(stmt); - for (const auto& ia : while_stmt->iter_args_) result.insert(ia->name_hint_); - for (const auto& rv : while_stmt->return_vars_) result.insert(rv->name_hint_); - CollectDeclaredNames(while_stmt->body_, result); - break; - } - case ObjectKind::IfStmt: { - auto if_stmt = std::static_pointer_cast(stmt); - for (const auto& rv : if_stmt->return_vars_) result.insert(rv->name_hint_); - CollectDeclaredNames(if_stmt->then_body_, 
result); - if (if_stmt->else_body_.has_value()) { - CollectDeclaredNames(*if_stmt->else_body_, result); - } - break; - } - case ObjectKind::SeqStmts: { - auto seq = std::static_pointer_cast(stmt); - for (const auto& s : seq->stmts_) { - CollectDeclaredNames(s, result); - } - break; - } - case ObjectKind::InCoreScopeStmt: - case ObjectKind::AutoInCoreScopeStmt: - case ObjectKind::ClusterScopeStmt: - case ObjectKind::HierarchyScopeStmt: - case ObjectKind::SpmdScopeStmt: { - auto scope = std::static_pointer_cast(stmt); - CollectDeclaredNames(scope->body_, result); - break; - } - default: - break; - } -} - -/** - * @brief Convert a vector of statements into a single StmtPtr. - * - * Returns an empty SeqStmts for empty input, the single statement for - * size==1, or a SeqStmts wrapping multiple statements. - */ -static StmtPtr MakeResultStmt(const std::vector& stmts, const Span& span) { - return SeqStmts::Flatten(std::vector(stmts), span); -} - -/** - * @brief Mutator that splits ForStmt nodes with chunk_config_ into nested loops. - * - * Runs after SSA conversion. Propagates iter_args through generated loops. - * Handles both compile-time constant and dynamic (runtime) loop bounds. - * - * Transforms (SSA form): - * for i, (x_iter=x_0,) in range(start, stop, step, chunk=C) -> (x_rv,): - * x_1 = add(x_iter, 1.0) - * yield(x_1) - * - * Into: - * for i_out, (x_outer=x_0,) in range(0, n_full) -> (x_outer_rv,): - * for i_in, (x_inner=x_outer,) in range(0, C) -> (x_inner_rv,): - * x_1 = add(x_inner, 1.0) - * yield(x_1) - * yield(x_inner_rv) - * # optional remainder - * for i_rem, (x_rem=x_outer_rv,) in range(0, n_rem) -> (x_rem_rv,): - * x_1_f = add(x_rem, 1.0) (fresh DEF variable) - * yield(x_1_f) - * return uses x_rem_rv (or x_outer_rv if no remainder) - * - * Where n_full and n_rem are ExprPtr — either ConstInt (when bounds are - * compile-time constants) or FloorDiv/FloorMod expressions (when dynamic). 
- */ -class ChunkedLoopSplitter : public IRMutator { - public: - void SeedUsedNames(const FunctionPtr& func) { - function_used_names_.clear(); - for (const auto& param : func->params_) { - if (param) { - function_used_names_.insert(param->name_hint_); - } - } - CollectDeclaredNames(func->body_, function_used_names_); - } - - StmtPtr VisitStmt_(const AutoInCoreScopeStmtPtr& op) override { - bool prev = inside_auto_incore_; - inside_auto_incore_ = true; - auto new_body = VisitStmt(op->body_); - inside_auto_incore_ = prev; - if (new_body.get() == op->body_.get()) { - return op; - } - auto new_scope = MutableCopy(op); - new_scope->body_ = std::move(new_body); - return new_scope; - } - - ExprPtr VisitExpr_(const VarPtr& op) override { - auto sub_it = substitution_map_.find(op.get()); - if (sub_it != substitution_map_.end()) { - return sub_it->second; - } - return op; - } - - ExprPtr VisitExpr_(const IterArgPtr& op) override { - auto sub_it = substitution_map_.find(op.get()); - if (sub_it != substitution_map_.end()) { - return sub_it->second; - } - return IRMutator::VisitExpr_(op); - } - - StmtPtr VisitStmt_(const ForStmtPtr& op) override { - if (!op->chunk_config_.has_value() || !inside_auto_incore_) { - return IRMutator::VisitStmt_(op); - } - - // chunk_size and step must always be compile-time constants - int64_t chunk_size = GetConstIntValue(op->chunk_config_->size, "chunk_size"); - int64_t step = GetConstIntValue(op->step_, "step"); - CHECK(step != 0) << "Chunked loop step cannot be zero"; - CHECK(chunk_size > 0) << "Chunk size must be positive, got " << chunk_size; - - Span sp = op->span_; - auto step_expr = MakeConstIndex(step, sp); - auto chunk_expr = MakeConstIndex(chunk_size, sp); - - ExprPtr start_expr = VisitExpr(op->start_); - ExprPtr stop_expr = VisitExpr(op->stop_); - - const Var* loop_var_key = op->loop_var_.get(); - auto loop_name = auto_name::Parse(op->loop_var_->name_hint_); - std::string base_name = loop_name.base_name; - - auto prev_loop_sub = 
SaveSubstitution(loop_var_key); - std::vector prev_ia_subs; - for (const auto& ia : op->iter_args_) { - prev_ia_subs.push_back(SaveSubstitution(ia.get())); - } - - bool has_iter_args = !op->iter_args_.empty(); - ChunkPolicy policy = op->chunk_config_->policy; - - if (policy == ChunkPolicy::LeadingFull) { - // Compute n_full and n_rem as ExprPtr. - ExprPtr n_full; - ExprPtr n_rem; - auto start_c = TryGetConstInt(start_expr); - auto stop_c = TryGetConstInt(stop_expr); - if (start_c && stop_c) { - int64_t tc = ComputeStaticTripCount(*start_c, *stop_c, step); - n_full = MakeConstIndex(tc / chunk_size, sp); - n_rem = MakeConstIndex(tc % chunk_size, sp); - } else { - ExprPtr trip_count = BuildTripCountExpr(start_expr, stop_expr, step, sp); - n_full = MakeFloorDiv(trip_count, chunk_expr, sp); - n_rem = MakeFloorMod(trip_count, chunk_expr, sp); - } - - auto n_full_c = TryGetConstInt(n_full); - auto n_rem_c = TryGetConstInt(n_rem); - bool emit_full = !n_full_c || *n_full_c > 0; - bool emit_rem = !n_rem_c || *n_rem_c > 0; - - if (!has_iter_args) { - return SplitLeadingFull(op, loop_var_key, base_name, loop_name.version, start_expr, step_expr, - chunk_expr, n_full, n_rem, emit_full, emit_rem, prev_loop_sub, sp); - } - - // Zero-trip optimization: when statically known, skip loop emission entirely - if (n_full_c && n_rem_c && *n_full_c == 0 && *n_rem_c == 0) { - INTERNAL_CHECK_SPAN(op->return_vars_.size() == op->iter_args_.size(), op->span_) - << "ForStmt return_vars/iter_args size mismatch in zero-trip chunk split"; - for (size_t i = 0; i < op->return_vars_.size(); ++i) { - substitution_map_[op->return_vars_[i].get()] = VisitExpr(op->iter_args_[i]->initValue_); - } - RestoreSubstitution(prev_loop_sub); - RestoreSubstitutions(prev_ia_subs); - return SeqStmts::Flatten(std::vector{}, sp); - } - - return SplitLeadingFullWithIterArgs(op, loop_var_key, base_name, loop_name.version, start_expr, - step_expr, chunk_expr, n_full, n_rem, emit_full, emit_rem, - prev_loop_sub, 
prev_ia_subs, sp); - } - - INTERNAL_CHECK_SPAN(policy == ChunkPolicy::Guarded, op->span_) - << "Unexpected ChunkPolicy in SplitChunkedLoops: " << ChunkPolicyToString(policy); - - // Compute n_total = ceil(trip_count / chunk_size). - ExprPtr n_total; - auto start_c = TryGetConstInt(start_expr); - auto stop_c = TryGetConstInt(stop_expr); - if (start_c && stop_c) { - int64_t tc = ComputeStaticTripCount(*start_c, *stop_c, step); - int64_t nt = (tc + chunk_size - 1) / chunk_size; - n_total = MakeConstIndex(nt, sp); - } else { - ExprPtr trip_count = BuildTripCountExpr(start_expr, stop_expr, step, sp); - ExprPtr numerator = MakeAdd(trip_count, MakeConstIndex(chunk_size - 1, sp), sp); - n_total = MakeFloorDiv(numerator, chunk_expr, sp); - } - - auto n_total_c = TryGetConstInt(n_total); - bool emit = !n_total_c || *n_total_c > 0; - - if (!emit) { - // Statically zero iterations: emit nothing and forward iter_arg initial values. - if (has_iter_args) { - INTERNAL_CHECK_SPAN(op->return_vars_.size() == op->iter_args_.size(), op->span_) - << "ForStmt return_vars/iter_args size mismatch in zero-trip guarded chunk split"; - for (size_t i = 0; i < op->return_vars_.size(); ++i) { - substitution_map_[op->return_vars_[i].get()] = VisitExpr(op->iter_args_[i]->initValue_); - } - } - RestoreSubstitution(prev_loop_sub); - RestoreSubstitutions(prev_ia_subs); - return SeqStmts::Flatten(std::vector{}, sp); - } - - if (!has_iter_args) { - return SplitGuarded(op, loop_var_key, base_name, loop_name.version, start_expr, step_expr, step, - chunk_expr, stop_expr, n_total, prev_loop_sub, sp); - } - return SplitGuardedWithIterArgs(op, loop_var_key, base_name, loop_name.version, start_expr, step_expr, - step, chunk_expr, stop_expr, n_total, prev_loop_sub, prev_ia_subs, sp); - } - - StmtPtr VisitStmt_(const SeqStmtsPtr& op) override { - std::vector new_stmts; - bool changed = false; - - for (const auto& stmt : op->stmts_) { - auto new_stmt = VisitStmt(stmt); - if (new_stmt.get() != stmt.get()) { - 
changed = true; - } - // Flatten nested SeqStmts - auto seq = std::dynamic_pointer_cast(new_stmt); - if (seq) { - for (const auto& inner : seq->stmts_) { - new_stmts.push_back(inner); - } - } else { - new_stmts.push_back(new_stmt); - } - } - - if (!changed) { - return op; - } - return SeqStmts::Flatten(std::move(new_stmts), op->span_); - } - - private: - bool inside_auto_incore_ = false; - std::unordered_set function_used_names_; - std::unordered_map substitution_map_; - - using SavedSubstitution = std::pair; - - SavedSubstitution SaveSubstitution(const Var* key) { - auto it = substitution_map_.find(key); - return {key, (it != substitution_map_.end()) ? it->second : nullptr}; - } - - void RestoreSubstitution(const SavedSubstitution& saved) { - if (saved.second) { - substitution_map_[saved.first] = saved.second; - } else { - substitution_map_.erase(saved.first); - } - } - - void RestoreSubstitutions(const std::vector& saved) { - for (const auto& entry : saved) { - RestoreSubstitution(entry); - } - } - - /** - * @brief Freshen all DEF vars in the body to preserve SSA uniqueness. - * - * Used when the body is visited more than once (e.g. full-chunk + remainder). - * Returns saved substitutions that must be restored after visiting the body. - */ - std::vector FreshenBodyDefVars(const StmtPtr& body) { - std::vector prev_def_subs; - std::vector body_def_vars; - CollectDefVars(body, body_def_vars); - for (const auto& var : body_def_vars) { - prev_def_subs.push_back(SaveSubstitution(var.get())); - auto fresh_name = auto_name::GenerateFreshNameLike(var->name_hint_, function_used_names_); - function_used_names_.insert(fresh_name); - auto fresh = std::make_shared(fresh_name, var->GetType(), var->span_); - substitution_map_[var.get()] = fresh; - } - return prev_def_subs; - } - - /** - * @brief Split a chunked loop without iter_args. - * - * n_full and n_rem are ExprPtr — either ConstInt or dynamic expressions. 
- */ - StmtPtr SplitLeadingFull(const ForStmtPtr& op, const Var* loop_var_key, const std::string& base_name, - const std::optional& loop_version, const ExprPtr& start_expr, - const ExprPtr& step_expr, const ExprPtr& chunk_expr, const ExprPtr& n_full, - const ExprPtr& n_rem, bool emit_full, bool emit_rem, - const SavedSubstitution& prev_loop_sub, const Span& sp) { - auto zero = MakeConstIndex(0, sp); - auto one = MakeConstIndex(1, sp); - std::vector result_stmts; - - if (emit_full) { - auto out_var = std::make_shared( - auto_name::BuildName(base_name, auto_name::ChunkOuterQualifier(), "idx", loop_version), - std::make_shared(DataType::INDEX), sp); - auto in_var = std::make_shared( - auto_name::BuildName(base_name, auto_name::ChunkInnerQualifier(), "idx", loop_version), - std::make_shared(DataType::INDEX), sp); - - // i = start + (i_out * C + i_in) * step - substitution_map_[loop_var_key] = - MakeAdd(start_expr, MakeMul(MakeAdd(MakeMul(out_var, chunk_expr), in_var), step_expr)); - auto inner_body = VisitStmt(op->body_); - - auto inner_for = std::make_shared( - in_var, zero, chunk_expr, one, std::vector{}, inner_body, std::vector{}, sp, - op->kind_, std::nullopt, MakeLoopAttrs(op->attrs_, LoopOrigin::ChunkInner)); - auto outer_for = std::make_shared( - out_var, zero, n_full, one, std::vector{}, inner_for, std::vector{}, sp, - op->kind_, std::nullopt, MakeLoopAttrs(op->attrs_, LoopOrigin::ChunkOuter)); - result_stmts.push_back(outer_for); - } - - if (emit_rem) { - auto rem_var = std::make_shared( - auto_name::BuildName(base_name, auto_name::ChunkRemainderQualifier(), "idx", loop_version), - std::make_shared(DataType::INDEX), sp); - - // i = start + (n_full * C + i_rem) * step - substitution_map_[loop_var_key] = - MakeAdd(start_expr, MakeMul(MakeAdd(MakeMul(n_full, chunk_expr), rem_var), step_expr)); - - std::vector prev_def_subs; - if (emit_full) { - prev_def_subs = FreshenBodyDefVars(op->body_); - } - auto rem_body = VisitStmt(op->body_); - 
RestoreSubstitutions(prev_def_subs); - - auto rem_for = std::make_shared(rem_var, zero, n_rem, one, std::vector{}, rem_body, - std::vector{}, sp, op->kind_, std::nullopt, - MakeLoopAttrs(op->attrs_, LoopOrigin::ChunkRemainder)); - result_stmts.push_back(rem_for); - } - - RestoreSubstitution(prev_loop_sub); - return MakeResultStmt(result_stmts, sp); - } - - /** - * @brief Split a chunked loop with iter_args (SSA propagation). - * - * n_full and n_rem are ExprPtr — either ConstInt or dynamic expressions. - */ - StmtPtr SplitLeadingFullWithIterArgs(const ForStmtPtr& op, const Var* loop_var_key, - const std::string& base_name, const std::optional& loop_version, - const ExprPtr& start_expr, const ExprPtr& step_expr, - const ExprPtr& chunk_expr, const ExprPtr& n_full, const ExprPtr& n_rem, - bool emit_full, bool emit_rem, const SavedSubstitution& prev_loop_sub, - const std::vector& prev_ia_subs, const Span& sp) { - auto zero = MakeConstIndex(0, sp); - auto one = MakeConstIndex(1, sp); - std::vector result_stmts; - std::vector final_return_vars; - - if (emit_full) { - auto out_var = std::make_shared( - auto_name::BuildName(base_name, auto_name::ChunkOuterQualifier(), "idx", loop_version), - std::make_shared(DataType::INDEX), sp); - auto in_var = std::make_shared( - auto_name::BuildName(base_name, auto_name::ChunkInnerQualifier(), "idx", loop_version), - std::make_shared(DataType::INDEX), sp); - - std::vector outer_iter_args; - std::vector outer_return_vars; - std::vector inner_iter_args; - std::vector inner_return_vars; - - for (const auto& ia : op->iter_args_) { - auto visited_init = VisitExpr(ia->initValue_); - auto ia_name = auto_name::Parse(ia->name_hint_); - auto outer_ia = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkOuterQualifier(), "iter", - ia_name.version), - ia->GetType(), visited_init, ia->span_); - auto outer_rv = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkOuterQualifier(), "rv", 
ia_name.version), - ia->GetType(), ia->span_); - outer_iter_args.push_back(outer_ia); - outer_return_vars.push_back(outer_rv); - - auto inner_ia = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkInnerQualifier(), "iter", - ia_name.version), - ia->GetType(), ExprPtr(outer_ia), ia->span_); - auto inner_rv = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkInnerQualifier(), "rv", ia_name.version), - ia->GetType(), ia->span_); - inner_iter_args.push_back(inner_ia); - inner_return_vars.push_back(inner_rv); - - substitution_map_[ia.get()] = inner_ia; - } - - // i = start + (i_out * C + i_in) * step - substitution_map_[loop_var_key] = - MakeAdd(start_expr, MakeMul(MakeAdd(MakeMul(out_var, chunk_expr), in_var), step_expr)); - auto inner_body = VisitStmt(op->body_); - - auto inner_for = std::make_shared(in_var, zero, chunk_expr, one, inner_iter_args, inner_body, - inner_return_vars, sp, op->kind_, std::nullopt, - MakeLoopAttrs(op->attrs_, LoopOrigin::ChunkInner)); - auto outer_yield = std::make_shared( - std::vector(inner_return_vars.begin(), inner_return_vars.end()), sp); - auto outer_body = SeqStmts::Flatten(std::vector{inner_for, outer_yield}, sp); - - auto outer_for = std::make_shared(out_var, zero, n_full, one, outer_iter_args, outer_body, - outer_return_vars, sp, op->kind_, std::nullopt, - MakeLoopAttrs(op->attrs_, LoopOrigin::ChunkOuter)); - - result_stmts.push_back(outer_for); - final_return_vars = outer_return_vars; - } - - if (emit_rem) { - auto rem_var = std::make_shared( - auto_name::BuildName(base_name, auto_name::ChunkRemainderQualifier(), "idx", loop_version), - std::make_shared(DataType::INDEX), sp); - - std::vector rem_iter_args; - std::vector rem_return_vars; - - for (size_t i = 0; i < op->iter_args_.size(); ++i) { - const auto& ia = op->iter_args_[i]; - ExprPtr rem_init = emit_full ? 
ExprPtr(final_return_vars[i]) : VisitExpr(ia->initValue_); - auto ia_name = auto_name::Parse(ia->name_hint_); - auto rem_ia = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkRemainderQualifier(), "iter", - ia_name.version), - ia->GetType(), rem_init, ia->span_); - auto rem_rv = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkRemainderQualifier(), "rv", - ia_name.version), - ia->GetType(), ia->span_); - rem_iter_args.push_back(rem_ia); - rem_return_vars.push_back(rem_rv); - - substitution_map_[ia.get()] = rem_ia; - } - - // i = start + (n_full * C + i_rem) * step - substitution_map_[loop_var_key] = - MakeAdd(start_expr, MakeMul(MakeAdd(MakeMul(n_full, chunk_expr), rem_var), step_expr)); - - std::vector prev_def_subs; - if (emit_full) { - prev_def_subs = FreshenBodyDefVars(op->body_); - } - auto rem_body = VisitStmt(op->body_); - RestoreSubstitutions(prev_def_subs); - - auto rem_for = std::make_shared(rem_var, zero, n_rem, one, rem_iter_args, rem_body, - rem_return_vars, sp, op->kind_, std::nullopt, - MakeLoopAttrs(op->attrs_, LoopOrigin::ChunkRemainder)); - - result_stmts.push_back(rem_for); - final_return_vars = rem_return_vars; - } - - INTERNAL_CHECK_SPAN(op->return_vars_.size() == final_return_vars.size(), op->span_) - << "SplitChunkedLoops produced mismatched return vars"; - for (size_t i = 0; i < op->return_vars_.size(); ++i) { - substitution_map_[op->return_vars_[i].get()] = final_return_vars[i]; - } - - RestoreSubstitution(prev_loop_sub); - RestoreSubstitutions(prev_ia_subs); - - return MakeResultStmt(result_stmts, sp); - } - - /** - * @brief Guarded split without iter_args. - * - * Emits a single outer loop over ceil(trip_count / C) chunks and an inner loop - * of size C, with the body wrapped in `if (idx < stop)` so out-of-range - * iterations become no-ops. This preserves a single-kernel outline for dynamic - * bounds and loops with cross-iteration state. 
- */ - StmtPtr SplitGuarded(const ForStmtPtr& op, const Var* loop_var_key, const std::string& base_name, - const std::optional& loop_version, const ExprPtr& start_expr, - const ExprPtr& step_expr, int64_t step, const ExprPtr& chunk_expr, - const ExprPtr& stop_expr, const ExprPtr& n_total, - const SavedSubstitution& prev_loop_sub, const Span& sp) { - auto zero = MakeConstIndex(0, sp); - auto one = MakeConstIndex(1, sp); - - auto out_var = std::make_shared( - auto_name::BuildName(base_name, auto_name::ChunkOuterQualifier(), "idx", loop_version), - std::make_shared(DataType::INDEX), sp); - auto in_var = std::make_shared( - auto_name::BuildName(base_name, auto_name::ChunkInnerQualifier(), "idx", loop_version), - std::make_shared(DataType::INDEX), sp); - - // idx = start + (out_var * C + in_var) * step - ExprPtr idx_expr = MakeAdd( - start_expr, MakeMul(MakeAdd(MakeMul(out_var, chunk_expr, sp), in_var, sp), step_expr, sp), sp); - substitution_map_[loop_var_key] = idx_expr; - auto visited_body = VisitStmt(op->body_); - - // Guard: for step > 0 use `idx < stop`, for step < 0 use `idx > stop`. - auto cond = step > 0 ? MakeLt(idx_expr, stop_expr, sp) : MakeGt(idx_expr, stop_expr, sp); - auto if_stmt = - std::make_shared(cond, visited_body, std::optional{}, std::vector{}, sp); - - auto inner_for = std::make_shared(in_var, zero, chunk_expr, one, std::vector{}, - if_stmt, std::vector{}, sp, op->kind_, std::nullopt, - MakeLoopAttrs(op->attrs_, LoopOrigin::ChunkInner)); - auto outer_for = std::make_shared(out_var, zero, n_total, one, std::vector{}, - inner_for, std::vector{}, sp, op->kind_, std::nullopt, - MakeLoopAttrs(op->attrs_, LoopOrigin::ChunkOuter)); - - RestoreSubstitution(prev_loop_sub); - return outer_for; - } - - /** - * @brief Guarded split with iter_args (SSA propagation through IfStmt phi). - * - * Wraps the body in an IfStmt whose return_vars act as phi nodes. 
The then - * branch ends with the user body's own YieldStmt; the else branch yields the - * unchanged inner iter_args. The inner loop's trailing YieldStmt references - * the IfStmt's phi return_vars, threading loop-carried state through both - * guarded and skipped iterations. - */ - StmtPtr SplitGuardedWithIterArgs(const ForStmtPtr& op, const Var* loop_var_key, - const std::string& base_name, const std::optional& loop_version, - const ExprPtr& start_expr, const ExprPtr& step_expr, int64_t step, - const ExprPtr& chunk_expr, const ExprPtr& stop_expr, - const ExprPtr& n_total, const SavedSubstitution& prev_loop_sub, - const std::vector& prev_ia_subs, const Span& sp) { - auto zero = MakeConstIndex(0, sp); - auto one = MakeConstIndex(1, sp); - - auto out_var = std::make_shared( - auto_name::BuildName(base_name, auto_name::ChunkOuterQualifier(), "idx", loop_version), - std::make_shared(DataType::INDEX), sp); - auto in_var = std::make_shared( - auto_name::BuildName(base_name, auto_name::ChunkInnerQualifier(), "idx", loop_version), - std::make_shared(DataType::INDEX), sp); - - std::vector outer_iter_args; - std::vector outer_return_vars; - std::vector inner_iter_args; - std::vector inner_return_vars; - std::vector if_return_vars; - - for (const auto& ia : op->iter_args_) { - auto visited_init = VisitExpr(ia->initValue_); - auto ia_name = auto_name::Parse(ia->name_hint_); - auto outer_ia = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkOuterQualifier(), "iter", ia_name.version), - ia->GetType(), visited_init, ia->span_); - auto outer_rv = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkOuterQualifier(), "rv", ia_name.version), - ia->GetType(), ia->span_); - outer_iter_args.push_back(outer_ia); - outer_return_vars.push_back(outer_rv); - - auto inner_ia = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkInnerQualifier(), "iter", ia_name.version), - ia->GetType(), ExprPtr(outer_ia), 
ia->span_); - auto inner_rv = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkInnerQualifier(), "rv", ia_name.version), - ia->GetType(), ia->span_); - inner_iter_args.push_back(inner_ia); - inner_return_vars.push_back(inner_rv); - - auto if_rv = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkGuardQualifier(), "rv", ia_name.version), - ia->GetType(), ia->span_); - if_return_vars.push_back(if_rv); - - substitution_map_[ia.get()] = inner_ia; - } - - // idx = start + (out_var * C + in_var) * step - ExprPtr idx_expr = MakeAdd( - start_expr, MakeMul(MakeAdd(MakeMul(out_var, chunk_expr, sp), in_var, sp), step_expr, sp), sp); - substitution_map_[loop_var_key] = idx_expr; - auto visited_body = VisitStmt(op->body_); - - // Else branch: pass through current inner iter_args unchanged. - std::vector else_yield_values(inner_iter_args.begin(), inner_iter_args.end()); - auto else_yield = std::make_shared(std::move(else_yield_values), sp); - - // Guarded IfStmt with phi return_vars. - // For step > 0 use `idx < stop`, for step < 0 use `idx > stop`. - auto cond = step > 0 ? 
MakeLt(idx_expr, stop_expr, sp) : MakeGt(idx_expr, stop_expr, sp); - auto if_stmt = - std::make_shared(cond, visited_body, std::optional{else_yield}, if_return_vars, sp); - - // Inner loop body: SeqStmts { IfStmt, YieldStmt(if_return_vars) } - auto inner_trailing_yield = - std::make_shared(std::vector(if_return_vars.begin(), if_return_vars.end()), sp); - auto inner_body = SeqStmts::Flatten(std::vector{if_stmt, inner_trailing_yield}, sp); - - auto inner_for = std::make_shared(in_var, zero, chunk_expr, one, inner_iter_args, inner_body, - inner_return_vars, sp, op->kind_, std::nullopt, - MakeLoopAttrs(op->attrs_, LoopOrigin::ChunkInner)); - - // Outer loop body: SeqStmts { inner_for, YieldStmt(inner_return_vars) } - auto outer_yield = std::make_shared( - std::vector(inner_return_vars.begin(), inner_return_vars.end()), sp); - auto outer_body = SeqStmts::Flatten(std::vector{inner_for, outer_yield}, sp); - - auto outer_for = std::make_shared(out_var, zero, n_total, one, outer_iter_args, outer_body, - outer_return_vars, sp, op->kind_, std::nullopt, - MakeLoopAttrs(op->attrs_, LoopOrigin::ChunkOuter)); - - INTERNAL_CHECK_SPAN(op->return_vars_.size() == outer_return_vars.size(), op->span_) - << "SplitChunkedLoops guarded produced mismatched return vars"; - for (size_t i = 0; i < op->return_vars_.size(); ++i) { - substitution_map_[op->return_vars_[i].get()] = outer_return_vars[i]; - } - - RestoreSubstitution(prev_loop_sub); - RestoreSubstitutions(prev_ia_subs); - return outer_for; - } -}; - -/** - * @brief Transform a function by splitting chunked loops. 
- */ -FunctionPtr TransformSplitChunkedLoops(const FunctionPtr& func) { - INTERNAL_CHECK(func) << "SplitChunkedLoops cannot run on null function"; - - ChunkedLoopSplitter splitter; - splitter.SeedUsedNames(func); - auto new_body = splitter.VisitStmt(func->body_); - - if (new_body.get() == func->body_.get()) { - return func; - } - - auto new_func = MutableCopy(func); - new_func->body_ = std::move(new_body); - return new_func; -} - -} // namespace - -// Factory function -namespace pass { -Pass SplitChunkedLoops() { - return CreateFunctionPass(TransformSplitChunkedLoops, "SplitChunkedLoops", kSplitChunkedLoopsProperties); -} -} // namespace pass - -} // namespace ir -} // namespace pypto diff --git a/src/ir/transforms/structural_equal.cpp b/src/ir/transforms/structural_equal.cpp index 612045582..bc2aa3ada 100644 --- a/src/ir/transforms/structural_equal.cpp +++ b/src/ir/transforms/structural_equal.cpp @@ -876,8 +876,6 @@ bool StructuralEqualImpl::Equal(const IRNodePtr& lhs, const IRNodePt EQUAL_DISPATCH(ReturnStmt) EQUAL_DISPATCH(ForStmt) EQUAL_DISPATCH(WhileStmt) - EQUAL_DISPATCH(InCoreScopeStmt) - EQUAL_DISPATCH(AutoInCoreScopeStmt) EQUAL_DISPATCH(ClusterScopeStmt) EQUAL_DISPATCH(HierarchyScopeStmt) EQUAL_DISPATCH(SpmdScopeStmt) diff --git a/src/ir/transforms/structural_hash.cpp b/src/ir/transforms/structural_hash.cpp index 7ef4e05bc..22d20ef32 100644 --- a/src/ir/transforms/structural_hash.cpp +++ b/src/ir/transforms/structural_hash.cpp @@ -552,8 +552,6 @@ StructuralHasher::result_type StructuralHasher::HashNode(const IRNodePtr& node) HASH_DISPATCH(ReturnStmt) HASH_DISPATCH(ForStmt) HASH_DISPATCH(WhileStmt) - HASH_DISPATCH(InCoreScopeStmt) - HASH_DISPATCH(AutoInCoreScopeStmt) HASH_DISPATCH(ClusterScopeStmt) HASH_DISPATCH(HierarchyScopeStmt) HASH_DISPATCH(SpmdScopeStmt) diff --git a/src/ir/transforms/utils/transform_utils.cpp b/src/ir/transforms/utils/transform_utils.cpp index 762bc18b8..0684b044d 100644 --- a/src/ir/transforms/utils/transform_utils.cpp +++ 
b/src/ir/transforms/utils/transform_utils.cpp @@ -255,8 +255,6 @@ void CollectDefVars(const StmtPtr& stmt, std::vector& result) { } break; } - case ObjectKind::InCoreScopeStmt: - case ObjectKind::AutoInCoreScopeStmt: case ObjectKind::ClusterScopeStmt: case ObjectKind::HierarchyScopeStmt: case ObjectKind::SpmdScopeStmt: { diff --git a/src/ir/transforms/visitor.cpp b/src/ir/transforms/visitor.cpp index 19752b253..87843057e 100644 --- a/src/ir/transforms/visitor.cpp +++ b/src/ir/transforms/visitor.cpp @@ -213,16 +213,6 @@ void IRVisitor::VisitStmt_(const WhileStmtPtr& op) { } } -void IRVisitor::VisitStmt_(const InCoreScopeStmtPtr& op) { - INTERNAL_CHECK_SPAN(op->body_, op->span_) << "InCoreScopeStmt has null body"; - VisitStmt(op->body_); -} - -void IRVisitor::VisitStmt_(const AutoInCoreScopeStmtPtr& op) { - INTERNAL_CHECK_SPAN(op->body_, op->span_) << "AutoInCoreScopeStmt has null body"; - VisitStmt(op->body_); -} - void IRVisitor::VisitStmt_(const ClusterScopeStmtPtr& op) { INTERNAL_CHECK_SPAN(op->body_, op->span_) << "ClusterScopeStmt has null body"; VisitStmt(op->body_); diff --git a/src/ir/verifier/property_verifier_registry.cpp b/src/ir/verifier/property_verifier_registry.cpp index 6553e104d..606787fd4 100644 --- a/src/ir/verifier/property_verifier_registry.cpp +++ b/src/ir/verifier/property_verifier_registry.cpp @@ -44,7 +44,6 @@ PropertyVerifierRegistry::PropertyVerifierRegistry() { Register(IRProperty::NoNestedCalls, CreateNoNestedCallPropertyVerifier); Register(IRProperty::NormalizedStmtStructure, CreateNormalizedStmtPropertyVerifier); Register(IRProperty::NoRedundantBlocks, CreateNoRedundantBlocksPropertyVerifier); - Register(IRProperty::SplitIncoreOrch, CreateSplitIncoreOrchPropertyVerifier); Register(IRProperty::ClusterOutlined, CreateClusterOutlinedPropertyVerifier); Register(IRProperty::HierarchyOutlined, CreateHierarchyOutlinedPropertyVerifier); Register(IRProperty::HasMemRefs, CreateHasMemRefsPropertyVerifier); @@ -57,7 +56,6 @@ 
PropertyVerifierRegistry::PropertyVerifierRegistry() { Register(IRProperty::UseAfterDef, CreateUseAfterDefPropertyVerifier); Register(IRProperty::StructuredCtrlFlow, CreateStructuredCtrlFlowPropertyVerifier); Register(IRProperty::OutParamNotShadowed, CreateOutParamNotShadowedPropertyVerifier); - Register(IRProperty::NoNestedInCore, CreateNoNestedIncorePropertyVerifier); Register(IRProperty::InOutUseValid, CreateInOutUseValidPropertyVerifier); } diff --git a/tests/ut/codegen/test_orchestration_codegen.py b/tests/ut/codegen/test_orchestration_codegen.py index e724fdc94..7778162ba 100644 --- a/tests/ut/codegen/test_orchestration_codegen.py +++ b/tests/ut/codegen/test_orchestration_codegen.py @@ -1152,52 +1152,6 @@ def orch( assert "acc__loop_state" not in code assert "params_t1.add_input(acc);" in code - def test_for_loop_with_inplace_return_after_passes(self): - """Test inplace detection when return var has compound auto-name suffixes from pass pipeline. - - When an Opaque function with auto_incore + parallel(chunk=) goes through the full - pass pipeline (SSA → split_chunked_loops → interchange_chunk_loops → outline), the - return var acquires compound suffixes like "__co_l0_rv_v1". GetSSABaseName must - strip all of these to match the return var back to the original param name for correct - inplace detection (2 arg slots, not 3). 
- """ - backend.reset_for_testing() - backend.set_backend_type(BackendType.Ascend910B) - - @pl.program - class ChunkedInplaceProgram: - @pl.function(type=pl.FunctionType.Opaque) - def add_one( - self, - input_tensor: pl.Tensor[[1024, 256], pl.FP32], - output_tensor: pl.Tensor[[1024, 256], pl.FP32], - ) -> pl.Tensor[[1024, 256], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for r in pl.parallel(0, 1024, 1, chunk=64, chunk_policy="leading_full"): - row_tile = pl.slice(input_tensor, [1, 256], [r, 0]) - row_result = pl.add(row_tile, 1.0) - output_tensor = pl.assemble(output_tensor, row_result, [r, 0]) - return output_tensor - - # Run the full pass pipeline to produce compound SSA suffixes - pm = PassManager.get_strategy(OptimizationStrategy.Default) - transformed = pm.run_passes(ChunkedInplaceProgram) - - code = _generate_orch_code(transformed) - - # Inplace detection: output_tensor return var should match the param, - # so only 2 orch arg slots (input_tensor + output_tensor), not 3 - assert "expected_arg_count = 2" in code - assert "from_tensor_arg(orch_args.tensor(0))" in code # input_tensor - assert "from_tensor_arg(orch_args.tensor(1))" in code # output_tensor - - # No third orch entry for the compound-named return var - assert "orch_args.tensor(2)" not in code - - # Task params should use ext_output_tensor (the inplace param), not a separate buffer - assert "ext_output_tensor)" in code - assert "ext_output_tensor_iter" not in code - def test_tensor_assemble_uses_precomputed_view(self): """tensor.assemble should lower to a pre-generated target view, not a host copy.""" diff --git a/tests/ut/codegen/test_pto_codegen_cross_core.py b/tests/ut/codegen/test_pto_codegen_cross_core.py index 05a539f5b..976f4e960 100644 --- a/tests/ut/codegen/test_pto_codegen_cross_core.py +++ b/tests/ut/codegen/test_pto_codegen_cross_core.py @@ -217,9 +217,7 @@ def _compile_and_generate(program) -> dict[str, str]: passes.unroll_loops, 
passes.convert_to_ssa, passes.flatten_call_expr, - passes.split_chunked_loops, - passes.interchange_chunk_loops, - passes.outline_incore_scopes, + passes.outline_hierarchy_scopes, passes.outline_cluster_scopes, passes.convert_tensor_to_tile_ops, passes.flatten_tile_nd_to_2d, @@ -511,9 +509,7 @@ def _expand_and_generate(program) -> dict[str, str]: pipeline.add_pass(passes.unroll_loops()) pipeline.add_pass(passes.convert_to_ssa()) pipeline.add_pass(passes.flatten_call_expr()) - pipeline.add_pass(passes.split_chunked_loops()) - pipeline.add_pass(passes.interchange_chunk_loops()) - pipeline.add_pass(passes.outline_incore_scopes()) + pipeline.add_pass(passes.outline_hierarchy_scopes()) pipeline.add_pass(passes.outline_cluster_scopes()) pipeline.add_pass(passes.convert_tensor_to_tile_ops()) pipeline.add_pass(passes.flatten_tile_nd_to_2d()) diff --git a/tests/ut/debug/test_torch_codegen.py b/tests/ut/debug/test_torch_codegen.py index 9aec50c90..dfc3d587d 100644 --- a/tests/ut/debug/test_torch_codegen.py +++ b/tests/ut/debug/test_torch_codegen.py @@ -429,7 +429,7 @@ def test_scope_is_transparent(): b = _tensor_var("b", [64]) call = _op_call("tensor.neg", [a]) assign = ir.AssignStmt(b, call, _span()) - scope = ir.InCoreScopeStmt(body=assign, span=_span()) + scope = ir.HierarchyScopeStmt(level=ir.Level.CORE_GROUP, body=assign, span=_span()) func = _simple_function("f", [a], scope) code = torch_codegen(func) assert "torch.neg(a)" in code diff --git a/tests/ut/ir/parser/test_at_optimizations.py b/tests/ut/ir/parser/test_at_optimizations.py index de5fe3234..e42389a0a 100644 --- a/tests/ut/ir/parser/test_at_optimizations.py +++ b/tests/ut/ir/parser/test_at_optimizations.py @@ -9,13 +9,11 @@ """Tests for pl.at(..., optimizations=[...]) parsing. -Covers issue #1030: the optimizations= list lets users express ``pl.split(...)`` -and ``pl.auto_chunk`` independently. 
The legacy ``optimization=`` and top-level -``split=`` kwargs remain functional but emit DeprecationWarning, and mixing the -new ``optimizations=`` with either deprecated kwarg is a hard error. +After the removal of InCoreScopeStmt / AutoInCoreScopeStmt, ``pl.at(...)`` always +produces a HierarchyScopeStmt. At ``Level.CORE_GROUP``, the ``optimizations=`` +list accepts ``pl.split(mode)`` to populate the scope's ``split`` field. """ -import warnings from typing import TypeVar import pypto.language as pl @@ -38,11 +36,11 @@ def _find_scope(stmt, scope_type: type[T]) -> T | None: return None -# ─── New API: optimizations=[pl.split(...)] → InCore with split ────────────── +# ─── optimizations=[pl.split(...)] → HierarchyScopeStmt with split ─────────── def test_parse_optimizations_split_only_up_down(): - """optimizations=[pl.split(UP_DOWN)] → InCore with split=UP_DOWN.""" + """optimizations=[pl.split(UP_DOWN)] → HierarchyScopeStmt with split=UP_DOWN.""" @pl.function def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: @@ -50,13 +48,14 @@ def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: y = pl.add(x, x) return y - scope = _find_scope(f.body, ir.InCoreScopeStmt) + scope = _find_scope(f.body, ir.HierarchyScopeStmt) assert scope is not None + assert scope.level == ir.Level.CORE_GROUP assert scope.split == ir.SplitMode.UP_DOWN def test_parse_optimizations_split_only_left_right(): - """optimizations=[pl.split(LEFT_RIGHT)] → InCore with split=LEFT_RIGHT.""" + """optimizations=[pl.split(LEFT_RIGHT)] → HierarchyScopeStmt with split=LEFT_RIGHT.""" @pl.function def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: @@ -64,81 +63,13 @@ def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: y = pl.add(x, x) return y - scope = _find_scope(f.body, ir.InCoreScopeStmt) + scope = _find_scope(f.body, ir.HierarchyScopeStmt) assert scope is not None assert scope.split == ir.SplitMode.LEFT_RIGHT -# ─── New API: optimizations=[pl.auto_chunk] → 
AutoInCore (no split) ────────── - - -def test_parse_optimizations_auto_chunk_only(): - """optimizations=[pl.auto_chunk] → AutoInCore with no split.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk]): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - scope = _find_scope(f.body, ir.AutoInCoreScopeStmt) - assert scope is not None - assert scope.split is None - - -# ─── New API: optimizations=[pl.auto_chunk, pl.split(...)] → AutoInCore + split - - -def test_parse_optimizations_auto_chunk_with_split(): - """optimizations=[pl.auto_chunk, pl.split(UP_DOWN)] → AutoInCore with split.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at( - level=pl.Level.CORE_GROUP, - optimizations=[pl.auto_chunk, pl.split(pl.SplitMode.UP_DOWN)], - ): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - scope = _find_scope(f.body, ir.AutoInCoreScopeStmt) - assert scope is not None - assert scope.split == ir.SplitMode.UP_DOWN - - -def test_parse_optimizations_order_independent(): - """List order does not affect the produced IR.""" - - @pl.function - def f1(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at( - level=pl.Level.CORE_GROUP, - optimizations=[pl.auto_chunk, pl.split(pl.SplitMode.LEFT_RIGHT)], - ): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - @pl.function - def f2(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at( - level=pl.Level.CORE_GROUP, - optimizations=[pl.split(pl.SplitMode.LEFT_RIGHT), pl.auto_chunk], - ): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - s1 = _find_scope(f1.body, ir.AutoInCoreScopeStmt) - s2 = _find_scope(f2.body, ir.AutoInCoreScopeStmt) - assert s1 
is not None and s2 is not None - assert s1.split == s2.split == ir.SplitMode.LEFT_RIGHT - - -def test_parse_optimizations_empty_list_is_plain_incore(): - """optimizations=[] → InCore with no split.""" +def test_parse_optimizations_empty_list_is_plain_hierarchy(): + """optimizations=[] → HierarchyScopeStmt with no split.""" @pl.function def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: @@ -146,138 +77,24 @@ def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: y = pl.add(x, x) return y - scope = _find_scope(f.body, ir.InCoreScopeStmt) + scope = _find_scope(f.body, ir.HierarchyScopeStmt) assert scope is not None assert scope.split is None -# ─── Equivalence with deprecated API ────────────────────────────────────────── - - -def test_legacy_chunked_loop_optimizer_matches_new_form(): - """Legacy bare optimizer (defaults to UP_DOWN) ≡ new auto_chunk + split(UP_DOWN).""" - - with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) - - @pl.function - def legacy(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x +def test_parse_core_group_no_optimizations(): + """pl.at(level=CORE_GROUP) without optimizations → plain HierarchyScopeStmt.""" @pl.function - def new(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at( - level=pl.Level.CORE_GROUP, - optimizations=[pl.auto_chunk, pl.split(pl.SplitMode.UP_DOWN)], - ): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - s_legacy = _find_scope(legacy.body, ir.AutoInCoreScopeStmt) - s_new = _find_scope(new.body, ir.AutoInCoreScopeStmt) - assert s_legacy is not None and s_new is not None - assert s_legacy.split == s_new.split == ir.SplitMode.UP_DOWN - - -def test_legacy_split_kwarg_matches_new_form(): - """Legacy top-level 
split= ≡ new optimizations=[pl.split(...)].""" - - with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) - - @pl.function - def legacy(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.LEFT_RIGHT): - y = pl.add(x, x) - return y - - @pl.function - def new(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.split(pl.SplitMode.LEFT_RIGHT)]): + def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + with pl.at(level=pl.Level.CORE_GROUP): y = pl.add(x, x) return y - s_legacy = _find_scope(legacy.body, ir.InCoreScopeStmt) - s_new = _find_scope(new.body, ir.InCoreScopeStmt) - assert s_legacy is not None and s_new is not None - assert s_legacy.split == s_new.split == ir.SplitMode.LEFT_RIGHT - - -# ─── DeprecationWarning emission ────────────────────────────────────────────── - - -def test_legacy_optimization_kwarg_emits_deprecation_warning(): - """Using the legacy optimization= kwarg emits DeprecationWarning.""" - with pytest.warns(DeprecationWarning, match="optimizations=\\[pl.auto_chunk\\]"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - -def test_legacy_split_kwarg_emits_deprecation_warning(): - """Using the legacy top-level split= kwarg emits DeprecationWarning.""" - with pytest.warns(DeprecationWarning, match="optimizations=\\[pl.split"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - y = pl.add(x, x) - return y - - -def test_new_optimizations_kwarg_emits_no_warning(): - """The new optimizations= API emits no DeprecationWarning.""" - with warnings.catch_warnings(): - 
warnings.simplefilter("error", DeprecationWarning) - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.split(pl.SplitMode.UP_DOWN)]): - y = pl.add(x, x) - return y - - -# ─── Hard errors when mixing new with deprecated kwargs ────────────────────── - - -def test_mix_optimizations_with_legacy_optimization_errors(): - """Cannot combine optimizations= with deprecated optimization=.""" - with pytest.raises(ParserSyntaxError, match="Cannot mix"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at( - level=pl.Level.CORE_GROUP, - optimizations=[pl.split(pl.SplitMode.UP_DOWN)], - optimization=pl.chunked_loop_optimizer, - ): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - -def test_mix_optimizations_with_legacy_split_errors(): - """Cannot combine optimizations= with deprecated split=.""" - with pytest.raises(ParserSyntaxError, match="Cannot mix"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at( - level=pl.Level.CORE_GROUP, - optimizations=[pl.auto_chunk], - split=pl.SplitMode.UP_DOWN, - ): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x + scope = _find_scope(f.body, ir.HierarchyScopeStmt) + assert scope is not None + assert scope.level == ir.Level.CORE_GROUP + assert scope.split is None # ─── Validation errors on optimizations= entries ────────────────────────────── @@ -289,23 +106,11 @@ def test_optimizations_must_be_list(): @pl.function def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimizations=pl.auto_chunk): # type: ignore[arg-type] + with pl.at(level=pl.Level.CORE_GROUP, optimizations=pl.split(pl.SplitMode.UP_DOWN)): # type: ignore[arg-type] y = pl.add(x, x) return y -def test_duplicate_auto_chunk_errors(): - """Two 
pl.auto_chunk entries in the same list is an error.""" - with pytest.raises(ParserSyntaxError, match="Duplicate.*auto_chunk"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk, pl.auto_chunk]): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - def test_duplicate_split_errors(): """Two pl.split(...) entries in the same list is an error.""" with pytest.raises(ParserSyntaxError, match="Duplicate.*split"): @@ -343,28 +148,11 @@ def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: def test_split_factory_rejects_none_at_runtime(): - """pl.split() also rejects SplitMode.NONE at construction time. - - The parser-level check above catches DSL source. This factory-level - check guards runtime construction (e.g., in scripts that build Split - instances directly), per the project rule that DSL helpers should - validate user input rather than relying on backend C++ checks. - """ + """pl.split() also rejects SplitMode.NONE at construction time.""" with pytest.raises(ValueError, match=r"SplitMode\.NONE"): pl.split(pl.SplitMode.NONE) -def test_auto_chunk_on_non_core_group_errors(): - """pl.auto_chunk is only valid at CORE_GROUP.""" - with pytest.raises(ParserSyntaxError, match="CORE_GROUP"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.HOST, optimizations=[pl.auto_chunk]): - y = pl.add(x, x) - return y - - def test_split_on_non_core_group_errors(): """pl.split(...) 
is only valid at CORE_GROUP.""" with pytest.raises(ParserSyntaxError, match="CORE_GROUP"): @@ -379,20 +167,6 @@ def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: # ─── Fully qualified pl.optimizations.* forms ──────────────────────────────── -def test_fully_qualified_auto_chunk(): - """pl.optimizations.auto_chunk also works.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.optimizations.auto_chunk]): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - scope = _find_scope(f.body, ir.AutoInCoreScopeStmt) - assert scope is not None - - def test_fully_qualified_split(): """pl.optimizations.split(...) also works.""" @@ -405,7 +179,7 @@ def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: y = pl.add(x, x) return y - scope = _find_scope(f.body, ir.InCoreScopeStmt) + scope = _find_scope(f.body, ir.HierarchyScopeStmt) assert scope is not None assert scope.split == ir.SplitMode.UP_DOWN diff --git a/tests/ut/ir/parser/test_parse_pl_at.py b/tests/ut/ir/parser/test_parse_pl_at.py index 2a0589191..72376d181 100644 --- a/tests/ut/ir/parser/test_parse_pl_at.py +++ b/tests/ut/ir/parser/test_parse_pl_at.py @@ -7,9 +7,8 @@ # See LICENSE in the root of the software repository for the full text of the License. # ----------------------------------------------------------------------------------------------------------- -"""Tests for parsing pl.at(level=..., role=...) 
(Step 04).""" +"""Tests for parsing pl.at(level=..., role=...).""" -import warnings from typing import TypeVar import pypto.language as pl @@ -150,28 +149,13 @@ def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: # ─── Backward compatibility ─────────────────────────────────────────────── -def test_backward_compat_incore(): - """Existing pl.incore() still works.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.incore(): - y = pl.add(x, x) - return y - - scope = _find_scope(f.body, ir.InCoreScopeStmt) - assert scope is not None - assert scope.scope_kind == ir.ScopeKind.InCore - assert not isinstance(scope, ir.HierarchyScopeStmt) - - def test_backward_compat_cluster(): """Existing pl.cluster() still works.""" @pl.function def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: with pl.cluster(): - with pl.incore(): + with pl.at(level=pl.Level.CORE_GROUP): y = pl.add(x, x) return y @@ -198,11 +182,11 @@ def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: assert "Role.Worker" in printed -# ─── New pl.at() InCore / AutoInCore forms ─────────────────────────────────── +# ─── pl.at() with CORE_GROUP level ─────────────────────────────────────── -def test_parse_pl_at_core_group_incore(): - """pl.at(level=CORE_GROUP) creates InCoreScopeStmt.""" +def test_parse_pl_at_core_group(): + """pl.at(level=CORE_GROUP) creates HierarchyScopeStmt at CORE_GROUP.""" @pl.function def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: @@ -210,81 +194,9 @@ def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: y = pl.add(x, x) return y - scope = _find_scope(f.body, ir.InCoreScopeStmt) - assert scope is not None - assert scope.scope_kind == ir.ScopeKind.InCore - - -def test_parse_pl_at_core_group_chunked_loop_optimizer_bare(): - """pl.at(level=CORE_GROUP, optimization=pl.chunked_loop_optimizer) → AutoInCore, split=UP_DOWN.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> 
pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - scope = _find_scope(f.body, ir.AutoInCoreScopeStmt) - assert scope is not None - assert scope.scope_kind == ir.ScopeKind.AutoInCore - assert scope.split == ir.SplitMode.UP_DOWN - - -def test_parse_pl_at_core_group_chunked_loop_optimizer_with_split(): - """pl.at(level=CORE_GROUP, optimization=chunked_loop_optimizer(split=LEFT_RIGHT)) → AutoInCore.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at( - level=pl.Level.CORE_GROUP, - optimization=pl.chunked_loop_optimizer(split=pl.SplitMode.LEFT_RIGHT), - ): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - scope = _find_scope(f.body, ir.AutoInCoreScopeStmt) + scope = _find_scope(f.body, ir.HierarchyScopeStmt) assert scope is not None - assert scope.scope_kind == ir.ScopeKind.AutoInCore - assert scope.split == ir.SplitMode.LEFT_RIGHT - - -def test_parse_pl_at_optimization_on_non_core_group_errors(): - """optimization= is not supported for non-CORE_GROUP levels.""" - with pytest.raises(ParserSyntaxError, match="CORE_GROUP"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.HOST, optimization=pl.chunked_loop_optimizer): - y = pl.add(x, x) - return y - - -def test_parse_pl_at_unknown_optimization_errors(): - """optimization= with unsupported value raises error.""" - with pytest.raises(ParserSyntaxError, match="chunked_loop_optimizer"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=42): # type: ignore[arg-type] - y = pl.add(x, x) - return y - - -def test_parse_pl_at_split_mode_none_errors(): - """chunked_loop_optimizer(split=SplitMode.NONE) raises error.""" - with 
pytest.raises(ParserSyntaxError, match=r"SplitMode\.NONE"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at( - level=pl.Level.CORE_GROUP, - optimization=pl.chunked_loop_optimizer(split=pl.SplitMode.NONE), - ): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x + assert scope.level == ir.Level.CORE_GROUP def test_parse_pl_at_role_with_core_group_errors(): @@ -298,138 +210,5 @@ def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: return y -def test_incore_deprecation_warning(): - """pl.incore() emits DeprecationWarning at parse time.""" - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.incore(): - y = pl.add(x, x) - return y - - assert any(issubclass(warning.category, DeprecationWarning) for warning in w) - assert any("pl.incore()" in str(warning.message) for warning in w) - - -def test_auto_incore_deprecation_warning(): - """pl.auto_incore() emits DeprecationWarning at parse time.""" - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.auto_incore(): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - assert any(issubclass(warning.category, DeprecationWarning) for warning in w) - assert any("pl.auto_incore()" in str(warning.message) for warning in w) - - -# ─── InCore with split ────────────────────────────────────────────────────── - - -def test_parse_pl_incore_with_split(): - """pl.incore(split=UP_DOWN) creates InCoreScopeStmt with split.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.incore(split=pl.SplitMode.UP_DOWN): - y = pl.add(x, x) - return y - - scope = _find_scope(f.body, ir.InCoreScopeStmt) - assert 
scope is not None - assert scope.scope_kind == ir.ScopeKind.InCore - assert scope.split == ir.SplitMode.UP_DOWN - - -def test_parse_pl_incore_with_split_left_right(): - """pl.incore(split=LEFT_RIGHT) creates InCoreScopeStmt with LEFT_RIGHT split.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.incore(split=pl.SplitMode.LEFT_RIGHT): - y = pl.add(x, x) - return y - - scope = _find_scope(f.body, ir.InCoreScopeStmt) - assert scope is not None - assert scope.scope_kind == ir.ScopeKind.InCore - assert scope.split == ir.SplitMode.LEFT_RIGHT - - -def test_parse_pl_at_core_group_with_split(): - """pl.at(level=CORE_GROUP, split=UP_DOWN) creates InCoreScopeStmt with split.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - y = pl.add(x, x) - return y - - scope = _find_scope(f.body, ir.InCoreScopeStmt) - assert scope is not None - assert scope.scope_kind == ir.ScopeKind.InCore - assert scope.split == ir.SplitMode.UP_DOWN - - -def test_parse_pl_at_core_group_with_split_left_right(): - """pl.at(level=CORE_GROUP, split=LEFT_RIGHT) creates InCoreScopeStmt with LEFT_RIGHT.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.LEFT_RIGHT): - y = pl.add(x, x) - return y - - scope = _find_scope(f.body, ir.InCoreScopeStmt) - assert scope is not None - assert scope.scope_kind == ir.ScopeKind.InCore - assert scope.split == ir.SplitMode.LEFT_RIGHT - - -def test_parse_pl_at_optimization_and_split_conflict(): - """Cannot use both optimization= and split= in pl.at().""" - with pytest.raises(ParserSyntaxError, match="Cannot use both"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at( - level=pl.Level.CORE_GROUP, - optimization=pl.chunked_loop_optimizer, - split=pl.SplitMode.UP_DOWN, - ): - y = pl.add(x, 
x) - return y - - -def test_parse_pl_at_split_on_non_core_group_errors(): - """split= is not supported for non-CORE_GROUP levels.""" - with pytest.raises(ParserSyntaxError, match="CORE_GROUP"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.HOST, split=pl.SplitMode.UP_DOWN): - y = pl.add(x, x) - return y - - -def test_printer_incore_with_split_roundtrip(): - """Python printer renders InCore scope with split and it can be re-parsed.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - y = pl.add(x, x) - return y - - printed = str(f) - assert "pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN)" in printed - - if __name__ == "__main__": pytest.main([__file__, "-v"]) diff --git a/tests/ut/ir/statements/test_scope_stmt.py b/tests/ut/ir/statements/test_scope_stmt.py index c8fca16fa..2ec7b1b86 100644 --- a/tests/ut/ir/statements/test_scope_stmt.py +++ b/tests/ut/ir/statements/test_scope_stmt.py @@ -7,7 +7,7 @@ # See LICENSE in the root of the software repository for the full text of the License. 
# ----------------------------------------------------------------------------------------------------------- -"""Unit tests for ScopeStmt class.""" +"""Unit tests for ScopeStmt class hierarchy.""" import pypto.language as pl import pytest @@ -17,35 +17,36 @@ class TestScopeStmt: """Test ScopeStmt construction, fields, and operations.""" - def test_scope_stmt_construction(self): - """Test basic InCoreScopeStmt construction.""" + def test_hierarchy_scope_construction(self): + """Test basic HierarchyScopeStmt construction at CORE_GROUP (replaces InCore scope).""" span = ir.Span("test.py", 1, 1, 1, 10) var_x = ir.Var("x", ir.TensorType([64], DataType.FP32), span) var_y = ir.Var("y", ir.TensorType([64], DataType.FP32), span) body = ir.AssignStmt(var_y, var_x, span) - scope = ir.InCoreScopeStmt(body=body, span=span) + scope = ir.HierarchyScopeStmt(level=ir.Level.CORE_GROUP, body=body, span=span) - assert scope.scope_kind == ir.ScopeKind.InCore + assert scope.scope_kind == ir.ScopeKind.Hierarchy + assert scope.level == ir.Level.CORE_GROUP assert isinstance(scope, ir.ScopeStmt) assert isinstance(scope.body, ir.AssignStmt) - def test_scope_stmt_structural_equality(self): - """Test structural equality for InCoreScopeStmt.""" + def test_hierarchy_scope_structural_equality(self): + """Test structural equality for HierarchyScopeStmt.""" span = ir.Span("test.py", 1, 1, 1, 10) var_x = ir.Var("x", ir.TensorType([64], DataType.FP32), span) var_y = ir.Var("y", ir.TensorType([64], DataType.FP32), span) body1 = ir.AssignStmt(var_y, var_x, span) - scope1 = ir.InCoreScopeStmt(body=body1, span=span) + scope1 = ir.HierarchyScopeStmt(level=ir.Level.CORE_GROUP, body=body1, span=span) body2 = ir.AssignStmt(var_y, var_x, span) - scope2 = ir.InCoreScopeStmt(body=body2, span=span) + scope2 = ir.HierarchyScopeStmt(level=ir.Level.CORE_GROUP, body=body2, span=span) assert ir.structural_equal(scope1, scope2) def test_scope_stmt_printing(self): - """Test Python printer output for ScopeStmt.""" + 
"""Test Python printer output for HierarchyScopeStmt at CORE_GROUP.""" @pl.program class TestProgram: @@ -58,25 +59,25 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: printed = TestProgram.as_python() assert "with pl.at(level=pl.Level.CORE_GROUP):" in printed - def test_scope_stmt_with_name(self): - """Test InCoreScopeStmt construction with a user-provided name.""" + def test_hierarchy_scope_with_name(self): + """Test HierarchyScopeStmt construction with a user-provided name.""" span = ir.Span("test.py", 1, 1, 1, 10) var_x = ir.Var("x", ir.TensorType([64], DataType.FP32), span) var_y = ir.Var("y", ir.TensorType([64], DataType.FP32), span) body = ir.AssignStmt(var_y, var_x, span) - scope = ir.InCoreScopeStmt(name_hint="my_kernel", body=body, span=span) + scope = ir.HierarchyScopeStmt(level=ir.Level.CORE_GROUP, name_hint="my_kernel", body=body, span=span) assert scope.name_hint == "my_kernel" - assert scope.scope_kind == ir.ScopeKind.InCore + assert scope.scope_kind == ir.ScopeKind.Hierarchy - def test_scope_stmt_default_name_is_empty(self): + def test_hierarchy_scope_default_name_is_empty(self): """Test that default name is empty string.""" span = ir.Span("test.py", 1, 1, 1, 10) var_x = ir.Var("x", ir.TensorType([64], DataType.FP32), span) var_y = ir.Var("y", ir.TensorType([64], DataType.FP32), span) body = ir.AssignStmt(var_y, var_x, span) - scope = ir.InCoreScopeStmt(body=body, span=span) + scope = ir.HierarchyScopeStmt(level=ir.Level.CORE_GROUP, body=body, span=span) assert scope.name_hint == "" def test_spmd_scope_requires_positive_core_num(self): @@ -101,6 +102,28 @@ def test_hierarchy_scope_typed_fields(self): assert scope.role == ir.Role.Worker assert scope.scope_kind == ir.ScopeKind.Hierarchy + def test_hierarchy_scope_split_at_core_group(self): + """HierarchyScopeStmt accepts split at CORE_GROUP.""" + span = ir.Span("test.py", 1, 1, 1, 10) + var_x = ir.Var("x", ir.TensorType([64], DataType.FP32), span) + var_y = ir.Var("y", 
ir.TensorType([64], DataType.FP32), span) + body = ir.AssignStmt(var_y, var_x, span) + + scope = ir.HierarchyScopeStmt( + level=ir.Level.CORE_GROUP, split=ir.SplitMode.UP_DOWN, body=body, span=span + ) + assert scope.split == ir.SplitMode.UP_DOWN + + def test_hierarchy_scope_split_rejected_at_non_core_group(self): + """HierarchyScopeStmt rejects split at levels other than CORE_GROUP.""" + span = ir.Span("test.py", 1, 1, 1, 10) + var_x = ir.Var("x", ir.TensorType([64], DataType.FP32), span) + var_y = ir.Var("y", ir.TensorType([64], DataType.FP32), span) + body = ir.AssignStmt(var_y, var_x, span) + + with pytest.raises(ValueError, match="split is only valid at Level::CORE_GROUP"): + ir.HierarchyScopeStmt(level=ir.Level.HOST, split=ir.SplitMode.UP_DOWN, body=body, span=span) + if __name__ == "__main__": pytest.main([__file__, "-v"]) diff --git a/tests/ut/ir/statements/test_scope_stmt_hierarchy.py b/tests/ut/ir/statements/test_scope_stmt_hierarchy.py index eb0540e15..7f641e300 100644 --- a/tests/ut/ir/statements/test_scope_stmt_hierarchy.py +++ b/tests/ut/ir/statements/test_scope_stmt_hierarchy.py @@ -7,7 +7,7 @@ # See LICENSE in the root of the software repository for the full text of the License. 
# ----------------------------------------------------------------------------------------------------------- -"""Tests for ScopeStmt Hierarchy kind (Step 03).""" +"""Tests for the typed ScopeStmt class hierarchy.""" import pypto.language as pl import pytest @@ -22,7 +22,7 @@ def _span(): return ir.Span("test", 1, 0) -# ─── ScopeKind.Hierarchy value ──────────────────────────────────────────────── +# ─── ScopeKind values ──────────────────────────────────────────────────────── def test_hierarchy_scope_kind_exists(): @@ -30,23 +30,16 @@ def test_hierarchy_scope_kind_exists(): assert hasattr(ir.ScopeKind, "Hierarchy") -def test_hierarchy_scope_kind_distinct(): - """Hierarchy is distinct from existing ScopeKind values.""" - assert ir.ScopeKind.Hierarchy != ir.ScopeKind.InCore - assert ir.ScopeKind.Hierarchy != ir.ScopeKind.AutoInCore +def test_scope_kinds_are_distinct(): + """Each surviving ScopeKind is distinct.""" assert ir.ScopeKind.Hierarchy != ir.ScopeKind.Cluster + assert ir.ScopeKind.Hierarchy != ir.ScopeKind.Spmd + assert ir.ScopeKind.Cluster != ir.ScopeKind.Spmd # ─── Construction with derived classes (issue #1047) ──────────────────────── -def test_in_core_scope_construction(): - """InCoreScopeStmt construction works.""" - s = ir.InCoreScopeStmt(body=_empty_body(), span=_span()) - assert s.scope_kind == ir.ScopeKind.InCore - assert isinstance(s, ir.ScopeStmt) - - def test_cluster_scope_construction(): """ClusterScopeStmt construction works.""" s = ir.ClusterScopeStmt(body=_empty_body(), span=_span()) @@ -114,11 +107,11 @@ def test_structural_equal_different_role(): def test_structural_equal_different_kinds(): - """Different scope kinds (InCore vs Hierarchy) compare as unequal.""" - s_in = ir.InCoreScopeStmt(body=_empty_body(), span=_span()) + """Different scope kinds compare as unequal.""" + s_cluster = ir.ClusterScopeStmt(body=_empty_body(), span=_span()) s_hier = ir.HierarchyScopeStmt(level=ir.Level.HOST, body=_empty_body(), span=_span()) with 
pytest.raises(ValueError): - ir.assert_structural_equal(s_in, s_hier) + ir.assert_structural_equal(s_cluster, s_hier) # ─── Python printer ────────────────────────────────────────────────────────── @@ -134,46 +127,59 @@ def test_printer_hierarchy_scope(): assert "Role.Worker" in printed -def test_printer_incore_scope_unchanged(): +def test_printer_core_group_scope(): body = _empty_body() - scope = ir.InCoreScopeStmt(body=body, span=_span()) + scope = ir.HierarchyScopeStmt(level=ir.Level.CORE_GROUP, body=body, span=_span()) func = ir.Function("test_fn", [], [], scope, _span()) printed = str(func) assert "pl.at(level=pl.Level.CORE_GROUP)" in printed -def test_printer_incore_scope_with_split(): +def test_printer_core_group_scope_with_split(): body = _empty_body() - scope = ir.InCoreScopeStmt(split=ir.SplitMode.UP_DOWN, body=body, span=_span()) + scope = ir.HierarchyScopeStmt( + level=ir.Level.CORE_GROUP, split=ir.SplitMode.UP_DOWN, body=body, span=_span() + ) func = ir.Function("test_fn", [], [], scope, _span()) printed = str(func) - assert "pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN)" in printed + assert "pl.at(level=pl.Level.CORE_GROUP" in printed + assert "pl.split(pl.SplitMode.UP_DOWN)" in printed -def test_scope_stmt_incore_with_split(): - s = ir.InCoreScopeStmt(split=ir.SplitMode.UP_DOWN, body=_empty_body(), span=_span()) - assert s.scope_kind == ir.ScopeKind.InCore +def test_scope_stmt_core_group_with_split(): + s = ir.HierarchyScopeStmt( + level=ir.Level.CORE_GROUP, split=ir.SplitMode.UP_DOWN, body=_empty_body(), span=_span() + ) + assert s.scope_kind == ir.ScopeKind.Hierarchy assert s.split == ir.SplitMode.UP_DOWN -def test_structural_equal_incore_with_split(): - s1 = ir.InCoreScopeStmt(split=ir.SplitMode.UP_DOWN, body=_empty_body(), span=_span()) - s2 = ir.InCoreScopeStmt(split=ir.SplitMode.UP_DOWN, body=_empty_body(), span=_span()) +def test_structural_equal_core_group_with_split(): + s1 = ir.HierarchyScopeStmt( + level=ir.Level.CORE_GROUP, 
split=ir.SplitMode.UP_DOWN, body=_empty_body(), span=_span() + ) + s2 = ir.HierarchyScopeStmt( + level=ir.Level.CORE_GROUP, split=ir.SplitMode.UP_DOWN, body=_empty_body(), span=_span() + ) ir.assert_structural_equal(s1, s2) -def test_structural_equal_incore_different_split(): - s1 = ir.InCoreScopeStmt(split=ir.SplitMode.UP_DOWN, body=_empty_body(), span=_span()) - s2 = ir.InCoreScopeStmt(split=ir.SplitMode.LEFT_RIGHT, body=_empty_body(), span=_span()) +def test_structural_equal_core_group_different_split(): + s1 = ir.HierarchyScopeStmt( + level=ir.Level.CORE_GROUP, split=ir.SplitMode.UP_DOWN, body=_empty_body(), span=_span() + ) + s2 = ir.HierarchyScopeStmt( + level=ir.Level.CORE_GROUP, split=ir.SplitMode.LEFT_RIGHT, body=_empty_body(), span=_span() + ) with pytest.raises(ValueError): ir.assert_structural_equal(s1, s2) -# ─── Outline pass safety ───────────────────────────────────────────────────── +# ─── Outline pass ──────────────────────────────────────────────────────────── -def test_outline_incore_works_with_normal_program(): - """OutlineIncoreScopes works normally on programs without Hierarchy scopes.""" +def test_outline_hierarchy_works_with_core_group_program(): + """OutlineHierarchyScopes outlines CORE_GROUP scopes into Function(InCore).""" @pl.program class P: @@ -183,22 +189,9 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: y = pl.add(x, x) return y - After = passes.outline_incore_scopes()(P) + After = passes.outline_hierarchy_scopes()(P) assert After is not None -def test_scope_outliner_ignores_hierarchy_kind(): - """ScopeOutliner (used by OutlineIncoreScopes) only targets its configured - ScopeKind and naturally ignores Hierarchy scopes via the ScopeKind check.""" - # The ScopeOutliner matches on target_scope_kind_ (InCore or Cluster). - # ScopeKind::Hierarchy (value 3) != InCore (0) != Cluster (2), so - # the outliner's VisitStmt_ will skip it via: if (scope_kind_ != target_) return. 
- # We verify this property at the enum level since we can't inject a Hierarchy - # scope via the DSL parser yet (pl.at() parsing is Step 04). - assert ir.ScopeKind.Hierarchy != ir.ScopeKind.InCore - assert ir.ScopeKind.Hierarchy != ir.ScopeKind.Cluster - assert ir.ScopeKind.Hierarchy != ir.ScopeKind.AutoInCore - - if __name__ == "__main__": pytest.main([__file__, "-v"]) diff --git a/tests/ut/ir/transforms/test_ctrl_flow_transform.py b/tests/ut/ir/transforms/test_ctrl_flow_transform.py index 292b56e8e..8d8bf0e60 100644 --- a/tests/ut/ir/transforms/test_ctrl_flow_transform.py +++ b/tests/ut/ir/transforms/test_ctrl_flow_transform.py @@ -2023,13 +2023,14 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: return x after_ssa = passes.convert_to_ssa()(Input) - after_outline = passes.outline_incore_scopes()(after_ssa) + after_outline_h = passes.outline_hierarchy_scopes()(after_ssa) + after_outline = passes.outline_incore_scopes()(after_outline_h) After = passes.ctrl_flow_transform()(after_outline) @pl.program class Expected: - @pl.function(type=pl.FunctionType.InCore, strict_ssa=True) - def main_incore_0(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: # noqa: F841 + @pl.function(type=pl.FunctionType.InCore, level=pl.Level.CORE_GROUP, strict_ssa=True) + def main_core_group_0(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: # noqa: F841 for i, (x_iter,) in pl.range(10, init_values=(x_0,)): if i < 5: phi: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter) @@ -2041,7 +2042,7 @@ def main_incore_0(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP3 @pl.function(type=pl.FunctionType.Orchestration) def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - x_rv: pl.Tensor[[64], pl.FP32] = self.main_incore_0(x_0) + x_rv: pl.Tensor[[64], pl.FP32] = self.main_core_group_0(x_0) return x_rv ir.assert_structural_equal(After, Expected) diff --git 
a/tests/ut/ir/transforms/test_fuse_create_assemble_to_slice.py b/tests/ut/ir/transforms/test_fuse_create_assemble_to_slice.py index 540ce0c7a..c6f6e79a9 100644 --- a/tests/ut/ir/transforms/test_fuse_create_assemble_to_slice.py +++ b/tests/ut/ir/transforms/test_fuse_create_assemble_to_slice.py @@ -21,7 +21,6 @@ def _run_prereqs_only(program): pipeline.add_pass(passes.normalize_stmt_structure()) pipeline.add_pass(passes.flatten_call_expr()) pipeline.add_pass(passes.outline_hierarchy_scopes()) - pipeline.add_pass(passes.outline_incore_scopes()) pipeline.add_pass(passes.outline_cluster_scopes()) ctx = passes.PassContext([], passes.VerificationLevel.NONE) with ctx: @@ -35,7 +34,6 @@ def _run_prereqs_and_fuse(program): pipeline.add_pass(passes.normalize_stmt_structure()) pipeline.add_pass(passes.flatten_call_expr()) pipeline.add_pass(passes.outline_hierarchy_scopes()) - pipeline.add_pass(passes.outline_incore_scopes()) pipeline.add_pass(passes.outline_cluster_scopes()) pipeline.add_pass(passes.fuse_create_assemble_to_slice()) ctx = passes.PassContext([], passes.VerificationLevel.NONE) diff --git a/tests/ut/ir/transforms/test_interchange_chunk_loops.py b/tests/ut/ir/transforms/test_interchange_chunk_loops.py deleted file mode 100644 index 4e80963cf..000000000 --- a/tests/ut/ir/transforms/test_interchange_chunk_loops.py +++ /dev/null @@ -1,1144 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
-# ----------------------------------------------------------------------------------------------------------- - -"""Unit tests for InterchangeChunkLoops pass. - -Test strategy: - Build a `Before` program, run prerequisite passes + InterchangeChunkLoops, - and compare to an explicitly-constructed `Expected` program using - `ir.assert_structural_equal(..., enable_auto_mapping=True)`. -""" - -import re - -import pypto.language as pl -import pytest -from pypto import ir, passes -from pypto.ir.printer import python_print - - -def _prepare_for_interchange(program): - """Run prerequisite passes to produce input for InterchangeChunkLoops.""" - program = passes.unroll_loops()(program) - program = passes.convert_to_ssa()(program) - program = passes.flatten_call_expr()(program) - program = passes.split_chunked_loops()(program) - return program - - -class TestSingleParallelChunk: - """Tests for single parallel chunked loop (1 outer + 1 inner, InCore wrapping only).""" - - def test_single_parallel_chunk_gets_incore(self): - """Single parallel chunked loop: outer wraps InCore around inner.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x2,) in pl.parallel( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x3: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x2, 1.0) - x4: pl.Tensor[[64], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[64], pl.FP32] = pl.yield_(x4) - return x5 - - After = 
passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestNestedParallelChunks: - """Tests for nested parallel chunked loops (full interchange + InCore).""" - - def test_two_nested_parallel_divisible(self): - """Two nested parallel chunked loops, divisible: full interchange + InCore.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - for j in pl.parallel(0, 12, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for j0, (x2,) in pl.parallel( - 3, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x3,) in pl.parallel( - 4, init_values=(x2,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - for j1, (x4,) in pl.parallel( - 4, init_values=(x3,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x5: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x4, 1.0) - x6: pl.Tensor[[64], pl.FP32] = pl.yield_(x5) - x7: pl.Tensor[[64], pl.FP32] = pl.yield_(x6) - x8: pl.Tensor[[64], pl.FP32] = pl.yield_(x7) - x9: pl.Tensor[[64], pl.FP32] = pl.yield_(x8) - return x9 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_two_nested_parallel_with_iter_args(self): - """Two nested parallel chunked loops with iter_args: verify SSA threading. 
- - Same Before as ``test_two_nested_parallel_divisible`` — this test also - structurally confirms that iter_args thread correctly through every - level of the interchanged nest. - """ - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - for j in pl.parallel(0, 12, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for j0, (x2,) in pl.parallel( - 3, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x3,) in pl.parallel( - 4, init_values=(x2,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - for j1, (x4,) in pl.parallel( - 4, init_values=(x3,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x5: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x4, 1.0) - x6: pl.Tensor[[64], pl.FP32] = pl.yield_(x5) - x7: pl.Tensor[[64], pl.FP32] = pl.yield_(x6) - x8: pl.Tensor[[64], pl.FP32] = pl.yield_(x7) - x9: pl.Tensor[[64], pl.FP32] = pl.yield_(x8) - return x9 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestNestedChunkChainsInitSubstitution: - """Tests that nested chunk chains correctly substitute init_values from parent chain.""" - - def test_nested_chains_init_values_substituted(self): - """Nested parallel chunk chains: inner chain init_values reference parent's - rewritten iter_args, not the original pre-interchange names.""" - - @pl.program - class Before: - @pl.function - def 
main( - self, - x: pl.Tensor[[64], pl.FP32], - y: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - for h in pl.parallel(0, 12, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, y) - return x - - @pl.program - class Expected: - @pl.function - def main( - self, - x0: pl.Tensor[[64], pl.FP32], - y0: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - for b0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for h0, (x2,) in pl.parallel( - 3, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for b1, (x3,) in pl.parallel( - 4, init_values=(x2,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - for h1, (x4,) in pl.parallel( - 4, init_values=(x3,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x5: pl.Tensor[[64], pl.FP32] = pl.tensor.add(x4, y0) - x6: pl.Tensor[[64], pl.FP32] = pl.yield_(x5) - x7: pl.Tensor[[64], pl.FP32] = pl.yield_(x6) - x8: pl.Tensor[[64], pl.FP32] = pl.yield_(x7) - x9: pl.Tensor[[64], pl.FP32] = pl.yield_(x8) - return x9 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_nested_chains_outline_no_crash(self): - """Nested parallel chunk chains followed by OutlineIncoreScopes must not crash. - - This is the end-to-end scenario from DeepSeekV3 decode that triggered the - 'Variable ... not found in symbol table' crash. 
- """ - - @pl.program - class Input: - @pl.function - def main( - self, - x: pl.Tensor[[64], pl.FP32], - y: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - for h in pl.parallel(0, 12, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, y) - return x - - program = _prepare_for_interchange(Input) - program = passes.interchange_chunk_loops()(program) - # This should not raise "Variable ... not found in symbol table" - program = passes.outline_incore_scopes()(program) - - incore_funcs = [f for f in program.functions.values() if f.func_type == ir.FunctionType.InCore] - assert len(incore_funcs) >= 1 - - def test_nested_chains_with_remainder_outline_no_crash(self): - """Nested chains with remainder: outline must not crash on substituted init_values.""" - - @pl.program - class Input: - @pl.function - def main( - self, - x: pl.Tensor[[64], pl.FP32], - y: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.parallel(0, 6, 1, chunk=4, chunk_policy="leading_full"): - for h in pl.parallel(0, 14, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, y) - return x - - program = _prepare_for_interchange(Input) - program = passes.interchange_chunk_loops()(program) - program = passes.outline_incore_scopes()(program) - - incore_funcs = [f for f in program.functions.values() if f.func_type == ir.FunctionType.InCore] - assert len(incore_funcs) >= 1 - - -class TestNestedChunksWithInterveningStatements: - """Tests for nested chunked parallel loops with intervening statements (issue #911).""" - - @staticmethod - def _make_input(): - @pl.program - class Input: - @pl.function - def main( - self, - x: pl.Tensor[[64], pl.FP32], - y: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - with 
pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.parallel(0, 16, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, y) - for h in pl.parallel(0, 8, 1, chunk=2, chunk_policy="leading_full"): - x = pl.add(x, y) - return x - - return Input - - def test_no_nested_incore_with_intervening_stmt(self): - """Nested chunks with intervening add: single InCore, no nesting.""" - - @pl.program - class Before: - @pl.function - def main( - self, - x: pl.Tensor[[64], pl.FP32], - y: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.parallel(0, 16, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, y) - for h in pl.parallel(0, 8, 1, chunk=2, chunk_policy="leading_full"): - x = pl.add(x, y) - return x - - @pl.program - class Expected: - @pl.function - def main( - self, - x0: pl.Tensor[[64], pl.FP32], - y0: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - for b0, (x1,) in pl.parallel( - 4, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for b1, (x2,) in pl.parallel( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x3: pl.Tensor[[64], pl.FP32] = pl.tensor.add(x2, y0) - for h0, (x4,) in pl.parallel( - 4, init_values=(x3,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for h1, (x5,) in pl.parallel( - 2, init_values=(x4,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x6: pl.Tensor[[64], pl.FP32] = pl.tensor.add(x5, y0) - x7: pl.Tensor[[64], pl.FP32] = pl.yield_(x6) - x8: pl.Tensor[[64], pl.FP32] = pl.yield_(x7) - x9: pl.Tensor[[64], pl.FP32] = pl.yield_(x8) - x10: pl.Tensor[[64], pl.FP32] = pl.yield_(x9) - return x10 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def 
test_outline_no_crash_with_intervening_stmt(self): - """Nested chunks with intervening stmt: outline must not crash.""" - program = _prepare_for_interchange(self._make_input()) - program = passes.interchange_chunk_loops()(program) - # This must not crash with nested InCore or missing operator - program = passes.outline_incore_scopes()(program) - - incore_funcs = [f for f in program.functions.values() if f.func_type == ir.FunctionType.InCore] - assert len(incore_funcs) >= 1 - - -class TestChunkWithRemainderInChain: - """Tests for chunk chains that include remainder loops (non-divisible inner).""" - - def test_chunk_outer_inner_with_remainder_preserves_iter_args(self): - """Chunk chain with trailing remainder: iter_args thread through inner, remainder preserved.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - for j in pl.parallel(0, 1, 1, chunk=2, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x2,) in pl.parallel( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - for j0, (x3,) in pl.parallel( - 1, init_values=(x2,), attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder} - ): - x4: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x3, 1.0) - x5: pl.Tensor[[64], pl.FP32] = pl.yield_(x4) - x6: pl.Tensor[[64], pl.FP32] = pl.yield_(x5) - x7: pl.Tensor[[64], pl.FP32] = pl.yield_(x6) - return x7 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - 
ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_chunk_with_remainder_body_contains_remainder_loop(self): - """Remainder loop inside chain body is preserved after interchange. - - Same Before as ``test_chunk_outer_inner_with_remainder_preserves_iter_args`` - — the matching Expected confirms the remainder loop structurally survives. - """ - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - for j in pl.parallel(0, 1, 1, chunk=2, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x2,) in pl.parallel( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - for j0, (x3,) in pl.parallel( - 1, init_values=(x2,), attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder} - ): - x4: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x3, 1.0) - x5: pl.Tensor[[64], pl.FP32] = pl.yield_(x4) - x6: pl.Tensor[[64], pl.FP32] = pl.yield_(x5) - x7: pl.Tensor[[64], pl.FP32] = pl.yield_(x6) - return x7 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestRemainderLoops: - """Tests for non-divisible cases with remainder loops.""" - - def test_non_divisible_with_remainder(self): - """Non-divisible with remainder: main chunk gets interchange, remainder gets InCore.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with 
pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 6, 1, chunk=4, chunk_policy="leading_full"): - for j in pl.parallel(0, 14, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 1, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x2,) in pl.parallel( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - for j0, (x3,) in pl.parallel( - 3, init_values=(x2,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for j1, (x4,) in pl.parallel( - 4, init_values=(x3,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x5: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x4, 1.0) - x6: pl.Tensor[[64], pl.FP32] = pl.yield_(x5) - x7: pl.Tensor[[64], pl.FP32] = pl.yield_(x6) - for j2, (x8,) in pl.parallel( - 2, init_values=(x7,), attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder} - ): - x9: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x8, 1.0) - x10: pl.Tensor[[64], pl.FP32] = pl.yield_(x9) - x11: pl.Tensor[[64], pl.FP32] = pl.yield_(x10) - x12: pl.Tensor[[64], pl.FP32] = pl.yield_(x11) - for i2, (x13,) in pl.parallel( - 2, init_values=(x12,), attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder} - ): - for j3, (x14,) in pl.parallel( - 3, init_values=(x13,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for j4, (x15,) in pl.parallel( - 4, init_values=(x14,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x16: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x15, 1.0) - x17: pl.Tensor[[64], pl.FP32] = pl.yield_(x16) - x18: pl.Tensor[[64], pl.FP32] = pl.yield_(x17) - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for j5, (x19,) in 
pl.parallel( - 2, init_values=(x18,), attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder} - ): - x20: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x19, 1.0) - x21: pl.Tensor[[64], pl.FP32] = pl.yield_(x20) - x22: pl.Tensor[[64], pl.FP32] = pl.yield_(x21) - return x22 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestNonChunkedLoops: - """Tests for loops that should pass through unchanged.""" - - def test_non_chunked_loop_unchanged(self): - """Regular (non-chunked) loops pass through untouched.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i in pl.range(0, 10, 1): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.range(10, init_values=(x0,)): - x2: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[64], pl.FP32] = pl.yield_(x2) - return x3 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestSequentialChunks: - """Tests for sequential chunked loops (should NOT interchange but get InCore wrapping).""" - - def test_sequential_chunk_gets_incore(self): - """Sequential chunked loop inside auto_incore: gets InCore wrapping.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i0, 
(x1,) in pl.range( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i1, (x2,) in pl.range( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x3: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x2, 1.0) - x4: pl.Tensor[[64], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[64], pl.FP32] = pl.yield_(x4) - return x5 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_nested_sequential_chunks_get_incore(self): - """Nested sequential chunked loops: no interchange, but get InCore wrapping.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 8, 1, chunk=4, chunk_policy="leading_full"): - for j in pl.range(0, 12, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i0, (x1,) in pl.range( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i1, (x2,) in pl.range( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - for j0, (x3,) in pl.range( - 3, init_values=(x2,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for j1, (x4,) in pl.range( - 4, init_values=(x3,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x5: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x4, 1.0) - x6: pl.Tensor[[64], pl.FP32] = pl.yield_(x5) - x7: pl.Tensor[[64], pl.FP32] = pl.yield_(x6) - x8: pl.Tensor[[64], pl.FP32] = pl.yield_(x7) - x9: pl.Tensor[[64], pl.FP32] = pl.yield_(x8) - return x9 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - 
ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestExistingInCore: - """Tests for loops with existing InCore scope (should skip interchange).""" - - def test_existing_incore_skip(self): - """Body already has ScopeStmt(InCore): pass through unchanged by interchange.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - with pl.at(level=pl.Level.CORE_GROUP): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i1, (x2,) in pl.parallel( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - with pl.at(level=pl.Level.CORE_GROUP): - x3: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x2, 1.0) - x4: pl.Tensor[[64], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[64], pl.FP32] = pl.yield_(x4) - return x5 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestAutoIncoreConsumed: - """Tests that auto_incore scope is consumed by InterchangeChunkLoops.""" - - def test_auto_incore_consumed(self): - """AutoInCore scope should be removed after InterchangeChunkLoops. - - Same Before as ``TestSingleParallelChunk::test_single_parallel_chunk_gets_incore`` - — the Expected has no ``chunked_loop_optimizer`` marker, structurally - asserting the AutoInCore scope was consumed. 
- """ - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x2,) in pl.parallel( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x3: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x2, 1.0) - x4: pl.Tensor[[64], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[64], pl.FP32] = pl.yield_(x4) - return x5 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestPassProperties: - """Tests for pass properties and factory.""" - - def test_pass_name(self): - """Pass has correct name.""" - p = passes.interchange_chunk_loops() - assert p.get_name() == "InterchangeChunkLoops" - - def test_pass_required_properties(self): - """Pass requires SSAForm (TypeChecked is a structural property).""" - p = passes.interchange_chunk_loops() - req = p.get_required_properties() - assert req.contains(passes.IRProperty.SSAForm) - - def test_pass_produced_properties(self): - """Pass produces SSAForm (TypeChecked is a structural property).""" - p = passes.interchange_chunk_loops() - prod = p.get_produced_properties() - assert prod.contains(passes.IRProperty.SSAForm) - - -class TestNoNestedIncoreVerifier: - """Tests for the NoNestedInCore structural property verifier (issue #912).""" - - def test_no_nested_incore_is_structural_property(self): - """NoNestedInCore is in the structural property set.""" - structural = 
passes.get_structural_properties() - assert structural.contains(passes.IRProperty.NoNestedInCore) - - def test_verifier_passes_on_valid_ir(self): - """Verifier passes when InterchangeChunkLoops produces valid (non-nested) InCore.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - program = _prepare_for_interchange(Input) - program = passes.interchange_chunk_loops()(program) - - props = passes.IRPropertySet() - props.insert(passes.IRProperty.NoNestedInCore) - diagnostics = passes.PropertyVerifierRegistry.verify(props, program) - errors = [d for d in diagnostics if d.severity == passes.DiagnosticSeverity.Error] - assert len(errors) == 0 - - def test_verifier_passes_with_intervening_stmts(self): - """Verifier passes on fixed nested chunks with intervening statements.""" - - @pl.program - class Input: - @pl.function - def main( - self, - x: pl.Tensor[[64], pl.FP32], - y: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.parallel(0, 16, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, y) - for h in pl.parallel(0, 8, 1, chunk=2, chunk_policy="leading_full"): - x = pl.add(x, y) - return x - - program = _prepare_for_interchange(Input) - program = passes.interchange_chunk_loops()(program) - - props = passes.IRPropertySet() - props.insert(passes.IRProperty.NoNestedInCore) - diagnostics = passes.PropertyVerifierRegistry.verify(props, program) - errors = [d for d in diagnostics if d.severity == passes.DiagnosticSeverity.Error] - assert len(errors) == 0 - - -class TestNonChunkStatementsWrapping: - """Tests that non-chunk statements inside auto_incore get InCore wrapping.""" - - def 
test_standalone_tensor_op_wrapped(self): - """Standalone tensor op inside auto_incore gets wrapped in InCore.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - x1: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x0, 1.0) - return x1 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_standalone_op_before_parallel_chunk(self): - """Standalone op before parallel chunk: op wrapped separately, chunk interchanged.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - x = pl.add(x, 1.0) - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 2.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - x1: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x0, 1.0) - for i0, (x2,) in pl.parallel( - 2, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x3,) in pl.parallel( - 4, init_values=(x2,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x4: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x3, 2.0) - x5: pl.Tensor[[64], pl.FP32] = pl.yield_(x4) - x6: pl.Tensor[[64], pl.FP32] = pl.yield_(x5) - return x6 - - After = 
passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_standalone_op_after_parallel_chunk(self): - """Standalone op after parallel chunk: chunk interchanged, op wrapped separately.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 2.0) - x = pl.mul(x, 3.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x2,) in pl.parallel( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x3: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x2, 2.0) - x4: pl.Tensor[[64], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[64], pl.FP32] = pl.yield_(x4) - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - x6: pl.Tensor[[64], pl.FP32] = pl.tensor.muls(x5, 3.0) - return x6 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_host_side_assemble_after_parallel_chunk_not_wrapped(self): - """Host-side tail assemble after a chunk stays outside InCore.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[4], pl.FP32]) -> pl.Tensor[[8], pl.FP32]: - out_0: pl.Tensor[[8], pl.FP32] = pl.tensor.create( - [8], dtype=pl.FP32, layout=pl.TensorLayout.ND - ) - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 4, 1, chunk=2, chunk_policy="leading_full"): - x = 
pl.tensor.adds(x, 1.0) - out_1: pl.Tensor[[8], pl.FP32] = pl.tensor.assemble(out_0, x, [0]) - return out_1 - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[4], pl.FP32]) -> pl.Tensor[[8], pl.FP32]: - out_0_0: pl.Tensor[[8], pl.FP32] = pl.tensor.create( - [8], dtype=pl.FP32, layout=pl.TensorLayout.ND - ) - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x2,) in pl.parallel( - 2, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x3: pl.Tensor[[4], pl.FP32] = pl.tensor.adds(x2, 1.0) - x4: pl.Tensor[[4], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[4], pl.FP32] = pl.yield_(x4) - out_1_0: pl.Tensor[[8], pl.FP32] = pl.tensor.assemble(out_0_0, x5, [0]) - return out_1_0 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_multiple_parallel_chunks_no_regression(self): - """Multiple parallel chunks with no standalone ops: all interchanged, no extra wrapping.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - for j in pl.parallel(0, 12, 1, chunk=4, chunk_policy="leading_full"): - x = pl.mul(x, 2.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x2,) in pl.parallel( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x3: pl.Tensor[[64], 
pl.FP32] = pl.tensor.adds(x2, 1.0) - x4: pl.Tensor[[64], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[64], pl.FP32] = pl.yield_(x4) - for j0, (x6,) in pl.parallel( - 3, init_values=(x5,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for j1, (x7,) in pl.parallel( - 4, init_values=(x6,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x8: pl.Tensor[[64], pl.FP32] = pl.tensor.muls(x7, 2.0) - x9: pl.Tensor[[64], pl.FP32] = pl.yield_(x8) - x10: pl.Tensor[[64], pl.FP32] = pl.yield_(x9) - return x10 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_non_chunked_loop_inside_auto_incore_wrapped(self): - """Non-chunked loop with tensor ops inside auto_incore gets wrapped in InCore.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(10): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i0, (x1,) in pl.range(10, init_values=(x0,)): - x2: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[64], pl.FP32] = pl.yield_(x2) - return x3 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_mixed_parallel_and_sequential_chunks(self): - """Mixed parallel chunk + sequential chunk: parallel interchanged, sequential wrapped.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, 
optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - for j in pl.range(0, 12, 1, chunk=4, chunk_policy="leading_full"): - x = pl.mul(x, 2.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x2,) in pl.parallel( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x3: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x2, 1.0) - x4: pl.Tensor[[64], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[64], pl.FP32] = pl.yield_(x4) - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for j0, (x6,) in pl.range( - 3, init_values=(x5,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for j1, (x7,) in pl.range( - 4, init_values=(x6,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x8: pl.Tensor[[64], pl.FP32] = pl.tensor.muls(x7, 2.0) - x9: pl.Tensor[[64], pl.FP32] = pl.yield_(x8) - x10: pl.Tensor[[64], pl.FP32] = pl.yield_(x9) - return x10 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestScalarAssignmentNotWrapped: - """Tests that pure scalar assignments stay outside InCore scopes.""" - - def test_scalar_assign_adjacent_to_compute_not_wrapped(self): - """Scalar assignment adjacent to tensor compute ops stays in orchestration.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for ob in pl.range(0, 8): - offset: pl.Scalar[pl.INDEX] = ob * 4 # noqa: F841 - x = pl.add(x, 1.0) - for i in pl.parallel(0, 8, 1, chunk=4, 
chunk_policy="leading_full"): - x = pl.add(x, 2.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for ob0, (x1,) in pl.range(8, init_values=(x0,)): - offset0: pl.Scalar[pl.INDEX] = ob0 * 4 - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - x2: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x1, 1.0) - for i0, (x3,) in pl.parallel( - 2, init_values=(x2,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x4,) in pl.parallel( - 4, init_values=(x3,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x5: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x4, 2.0) - x6: pl.Tensor[[64], pl.FP32] = pl.yield_(x5) - x7: pl.Tensor[[64], pl.FP32] = pl.yield_(x6) - x8: pl.Tensor[[64], pl.FP32] = pl.yield_(x7) - return x8 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_scalar_assign_not_wrapped_outline_no_crash(self): - """Scalar assignment stays in orchestration after outline — no undefined variable.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for ob in pl.range(0, 8): - offset: pl.Scalar[pl.INDEX] = ob * 4 # noqa: F841 - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 2.0) - return x - - program = _prepare_for_interchange(Input) - program = passes.interchange_chunk_loops()(program) - # This should not crash with undefined variable references - program = passes.outline_incore_scopes()(program) - - incore_funcs = [f for f in program.functions.values() if f.func_type == ir.FunctionType.InCore] - assert len(incore_funcs) >= 1 - - -class TestEndToEndNoComputeLeaks: - """End-to-end tests 
verifying no compute tensor ops leak into Orchestration.""" - - def _run_through_outline(self, program): - """Run prerequisite passes + interchange + outline.""" - program = _prepare_for_interchange(program) - program = passes.interchange_chunk_loops()(program) - program = passes.outline_incore_scopes()(program) - return program - - # Host-side tensor ops that are allowed in Orchestration - _HOST_SIDE_OPS = { - "tensor.create", - "tensor.read", - "tensor.write", - "tensor.slice", - "tensor.assemble", - "tensor.dim", - "tensor.reshape", - "tensor.transpose", - } - - def _assert_no_compute_leaks(self, program, min_incore_funcs=1): - """Assert no compute tensor ops in Orchestration and enough InCore functions exist.""" - for func in program.functions.values(): - if func.func_type == ir.FunctionType.Orchestration: - func_str = python_print(func) - for match in re.findall(r"tensor\.\w+", func_str): - assert match in self._HOST_SIDE_OPS, ( - f"Compute tensor op '{match}' leaked into Orchestration" - ) - - incore_funcs = [f for f in program.functions.values() if f.func_type == ir.FunctionType.InCore] - assert len(incore_funcs) >= min_incore_funcs - - def test_standalone_op_outlined(self): - """Standalone op inside auto_incore: outlined into InCore function.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - x = pl.add(x, 1.0) - return x - - After = self._run_through_outline(Input) - self._assert_no_compute_leaks(After, min_incore_funcs=1) - - def test_mix_standalone_and_parallel_chunk_outlined(self): - """Mix of standalone + parallel chunk: two InCore functions, orchestration clean.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - x = pl.add(x, 1.0) - for i in 
pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 2.0) - return x - - After = self._run_through_outline(Input) - self._assert_no_compute_leaks(After, min_incore_funcs=2) - - def test_sequential_chunk_outlined(self): - """Sequential chunk inside auto_incore: one InCore function containing the whole loop chain.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - After = self._run_through_outline(Input) - self._assert_no_compute_leaks(After, min_incore_funcs=1) - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/tests/ut/ir/transforms/test_ir_property.py b/tests/ut/ir/transforms/test_ir_property.py index 811d82d99..6f22a73ae 100644 --- a/tests/ut/ir/transforms/test_ir_property.py +++ b/tests/ut/ir/transforms/test_ir_property.py @@ -23,7 +23,7 @@ def test_property_values_exist(self): assert passes.IRProperty.NoNestedCalls is not None assert passes.IRProperty.NormalizedStmtStructure is not None assert passes.IRProperty.NoRedundantBlocks is not None - assert passes.IRProperty.SplitIncoreOrch is not None + assert passes.IRProperty.HierarchyOutlined is not None assert passes.IRProperty.HasMemRefs is not None def test_property_values_are_different(self): @@ -34,7 +34,7 @@ def test_property_values_are_different(self): passes.IRProperty.NoNestedCalls, passes.IRProperty.NormalizedStmtStructure, passes.IRProperty.NoRedundantBlocks, - passes.IRProperty.SplitIncoreOrch, + passes.IRProperty.HierarchyOutlined, passes.IRProperty.HasMemRefs, ] assert len(props) == len(set(props)) @@ -185,12 +185,23 @@ def test_flatten_call_expr_requires_and_produces_ssa(self): assert p.get_produced_properties().contains(passes.IRProperty.SSAForm) assert 
p.get_produced_properties().contains(passes.IRProperty.NoNestedCalls) - def test_outline_incore_requires_and_produces_ssa(self): - """Test OutlineIncoreScopes requires and produces SSAForm.""" + def test_outline_hierarchy_requires_and_produces_ssa(self): + """Test OutlineHierarchyScopes requires and produces SSAForm. + + HierarchyOutlined is *not* produced here — CORE_GROUP scopes survive this + pass and are outlined by OutlineIncoreScopes, which produces the property. + """ + p = passes.outline_hierarchy_scopes() + assert p.get_required_properties().contains(passes.IRProperty.SSAForm) + assert p.get_produced_properties().contains(passes.IRProperty.SSAForm) + assert not p.get_produced_properties().contains(passes.IRProperty.HierarchyOutlined) + + def test_outline_incore_requires_ssa_produces_hierarchy_outlined(self): + """OutlineIncoreScopes requires SSAForm and produces SSAForm + HierarchyOutlined.""" p = passes.outline_incore_scopes() assert p.get_required_properties().contains(passes.IRProperty.SSAForm) assert p.get_produced_properties().contains(passes.IRProperty.SSAForm) - assert p.get_produced_properties().contains(passes.IRProperty.SplitIncoreOrch) + assert p.get_produced_properties().contains(passes.IRProperty.HierarchyOutlined) def test_outline_cluster_requires_and_produces_ssa(self): """Test OutlineClusterScopes requires and produces SSAForm.""" diff --git a/tests/ut/ir/transforms/test_normalize_return_order.py b/tests/ut/ir/transforms/test_normalize_return_order.py index 461cdada7..8d69bcd0b 100644 --- a/tests/ut/ir/transforms/test_normalize_return_order.py +++ b/tests/ut/ir/transforms/test_normalize_return_order.py @@ -378,7 +378,7 @@ def test_pass_name(self): def test_required_properties(self): p = passes.normalize_return_order() required = p.get_required_properties() - assert required.contains(passes.IRProperty.SplitIncoreOrch) + assert required.contains(passes.IRProperty.HierarchyOutlined) assert required.contains(passes.IRProperty.IncoreTileOps) def 
test_no_produced_properties(self): diff --git a/tests/ut/ir/transforms/test_outline_hierarchy_scopes.py b/tests/ut/ir/transforms/test_outline_hierarchy_scopes.py index b26ec402c..59e406495 100644 --- a/tests/ut/ir/transforms/test_outline_hierarchy_scopes.py +++ b/tests/ut/ir/transforms/test_outline_hierarchy_scopes.py @@ -147,8 +147,14 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: After = passes.outline_hierarchy_scopes()(Before) ir.assert_structural_equal(After, Expected) - def test_outline_hierarchy_with_incore_preserved(self): - """Test that InCore scope inside Hierarchy scope is preserved (not outlined by this pass).""" + def test_outline_hierarchy_with_nested_core_group_preserves_core_group(self): + """CORE_GROUP scope nested inside HOST is preserved verbatim in the outlined function. + + outline_hierarchy_scopes only outlines non-CORE_GROUP Hierarchy scopes. + After this pass, the HOST scope is outlined into a new Opaque function + whose body still contains the CORE_GROUP scope unchanged. The + CORE_GROUP scope is later outlined by outline_incore_scopes. 
+ """ @pl.program class Before: @@ -159,23 +165,17 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) return y - @pl.program - class Expected: - @pl.function(level=pl.Level.HOST, role=pl.Role.Worker) - def main_host_worker_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return y - - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = self.main_host_worker_0(x) - return y - Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) After = passes.outline_hierarchy_scopes()(Before) - ir.assert_structural_equal(After, Expected) + # The HOST scope was outlined into an Opaque function; main and the new + # function exist, no InCore function appears yet. + func_types = {gv.name: func.func_type for gv, func in After.functions.items()} + assert "main" in func_types + host_funcs = [n for n in func_types if "host_worker" in n] + assert len(host_funcs) == 1 + assert func_types[host_funcs[0]] == ir.FunctionType.Opaque + # No CORE_GROUP InCore outlining happens in this pass + assert not any(t == ir.FunctionType.InCore for t in func_types.values()) def test_outline_hierarchy_multiple_inputs(self): """Test outlining scope that uses multiple outer variables.""" @@ -292,8 +292,13 @@ def main(self, x: pl.Tensor[[64], pl.FP32], cond: pl.Scalar[pl.BOOL]) -> pl.Tens After = passes.outline_hierarchy_scopes()(Before) ir.assert_structural_equal(After, Expected) - def test_hierarchy_does_not_affect_incore_scopes(self): - """Test that OutlineHierarchyScopes does not outline InCore scopes.""" + def test_hierarchy_preserves_core_group_scopes(self): + """CORE_GROUP hierarchy scopes are NOT outlined by outline_hierarchy_scopes. 
+ + OutlineHierarchyScopes is responsible for non-CORE_GROUP scopes only; + CORE_GROUP scopes survive intact and are outlined into InCore functions + by the subsequent OutlineIncoreScopes pass. + """ @pl.program class Before: @@ -305,7 +310,7 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: Before = passes.convert_to_ssa()(Before) After = passes.outline_hierarchy_scopes()(Before) - # InCore scopes should remain untouched by the hierarchy pass + # Nothing was outlined; main keeps Opaque, no InCore function appears. ir.assert_structural_equal(After, Before) def test_hierarchy_does_not_affect_cluster_scopes(self): @@ -499,38 +504,6 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: Reparsed = pl.parse_program(printed) ir.assert_structural_equal(After, Reparsed) - def test_outline_then_incore(self): - """Test hierarchy outlined first, then InCore outlined from inside hierarchy function.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.HOST, role=pl.Role.Worker): - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return y - - Before = passes.convert_to_ssa()(Before) - - # Step 1: Outline hierarchy scopes - After1 = passes.outline_hierarchy_scopes()(Before) - - # The outlined hierarchy function should contain the InCore scope - hierarchy_func = After1.get_function("main_host_worker_0") - assert hierarchy_func is not None - assert hierarchy_func.level == ir.Level.HOST - printed1 = After1.as_python() - assert "pl.at(level=pl.Level.CORE_GROUP)" in printed1 - - # Step 2: Outline incore scopes (processes Opaque functions including hierarchy-outlined ones) - After2 = passes.outline_incore_scopes()(After1) - - # The InCore scope should now be outlined from the hierarchy function - incore_func = After2.get_function("main_host_worker_0_incore_0") - assert incore_func is not None - assert 
incore_func.func_type == ir.FunctionType.InCore - def test_outline_hierarchy_with_alias_level(self): """Test that level aliases (POD = CLUSTER_0) resolve to canonical name.""" @@ -553,7 +526,12 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: class TestHierarchyOutlinedVerifier: - """Tests for the HierarchyOutlined property verifier.""" + """Tests for the HierarchyOutlined property verifier. + + HierarchyOutlined is jointly established by OutlineHierarchyScopes (handles + non-CORE_GROUP) and OutlineIncoreScopes (handles CORE_GROUP). Verification + only passes once both have run (or once both kinds of scopes are absent). + """ @staticmethod def _hierarchy_outlined_props(): @@ -576,6 +554,7 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: with ctx: program = passes.convert_to_ssa()(Input) program = passes.outline_hierarchy_scopes()(program) + program = passes.outline_incore_scopes()(program) # Should not throw — no Hierarchy scopes remain passes.verify_properties(self._hierarchy_outlined_props(), program, "test") @@ -600,6 +579,26 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: with pytest.raises(Exception, match="Hierarchy ScopeStmt"): passes.verify_properties(self._hierarchy_outlined_props(), program, "test") + def test_remaining_core_group_scope_fails_verification(self): + """A surviving CORE_GROUP scope (only OutlineHierarchyScopes ran) fails verification.""" + + @pl.program + class Input: + @pl.function + def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + with pl.at(level=pl.Level.CORE_GROUP): + y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) + return y + + ctx = passes.PassContext([], passes.VerificationLevel.NONE) + with ctx: + program = passes.convert_to_ssa()(Input) + # outline_hierarchy_scopes alone leaves CORE_GROUP scopes intact + program = passes.outline_hierarchy_scopes()(program) + + with pytest.raises(Exception, match="Hierarchy ScopeStmt"): + 
passes.verify_properties(self._hierarchy_outlined_props(), program, "test") + def test_program_without_hierarchy_passes_verification(self): """Program that never had Hierarchy scopes passes verification.""" diff --git a/tests/ut/ir/transforms/test_outline_incore_interleaved_ops.py b/tests/ut/ir/transforms/test_outline_incore_interleaved_ops.py deleted file mode 100644 index 1f0e0d8f9..000000000 --- a/tests/ut/ir/transforms/test_outline_incore_interleaved_ops.py +++ /dev/null @@ -1,567 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- - -"""Regression tests for non-parallel code inside auto_incore losing InCore scope. - -Root cause ----------- -InterchangeChunkLoops consumes ``auto_incore`` and wraps each interchanged -parallel chunk body in ``ScopeStmt(InCore)``. However, non-parallel code -(range loops, straight-line ops) that sits *between* parallel chunk loops -inside the same ``auto_incore`` scope is left without an InCore wrapper. - -``WrapNonIncoreStatementsInInCore`` only operates on the direct children of -the ``auto_incore`` body. When the body is a single ``ForStmt`` (e.g. 
a -``pl.range`` loop) whose body *contains* InCore scopes from the interchanged -parallel chunks, ``ContainsInCoreScope`` returns ``True`` for the entire -``ForStmt``, so the function returns it as-is — leaving non-parallel code -inside the loop body unwrapped. - -Consequence: ``OutlineIncoreScopes`` cannot outline these unwrapped -operations, so they stay in the Orchestration function as bare tensor ops -(including matmul), which downstream passes (ConvertTensorToTileOps, -ExpandMixedKernel, etc.) cannot process correctly. - -This reproduces the issue observed in the Qwen3SingleLayerDecode model where -the MLP gate/up projection matmuls remained in the Orchestration function. -""" - -import pypto.language as pl -import pytest -from pypto import ir, passes - - -def _run_pipeline(program): - """Run prerequisite passes plus interchange + outline. - - This is the full pipeline exercised by these tests: it reproduces the - setup that triggered the original bug (parallel chunks + non-parallel - code inside auto_incore). 
- """ - program = passes.unroll_loops()(program) - program = passes.convert_to_ssa()(program) - program = passes.flatten_call_expr()(program) - program = passes.split_chunked_loops()(program) - program = passes.interchange_chunk_loops()(program) - program = passes.outline_incore_scopes()(program) - return program - - -class TestNonParallelCodeBetweenChunks: - """Non-parallel code between parallel chunk loops inside auto_incore - must be wrapped in InCore scope so that OutlineIncoreScopes can outline it.""" - - def test_interleaved_scalar_op_gets_incore(self): - """A scalar op between two parallel chunks must get an InCore scope.""" - - @pl.program - class Before: - @pl.function - def main( - self, - x: pl.Tensor[[8, 64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.range(0, 8, 4): - for i in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.adds(x, 1.0) - y: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(x, 2.0) - for j in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.add(x, y) - return x - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_0(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for i1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_1(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - y0: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(x0, 2.0) - return y0 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_2( - self, - x0: pl.Tensor[[8, 64], pl.FP32], - y0: pl.Tensor[[8, 
64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - for j1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.add(x1, y0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for b0, (x1,) in pl.range(0, 8, 4, init_values=(x0,)): - for i0, (x2,) in pl.parallel( - 2, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x3: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_0(x2) - x4: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x3) - y0: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_1(x4) - for j0, (x5,) in pl.parallel( - 2, init_values=(x4,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x6: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_2(x5, y0) - x7: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x6) - x8: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x7) - return x8 - - After = _run_pipeline(Before) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_interleaved_range_loop_gets_incore(self): - """A range loop between parallel chunks must get an InCore scope. - - This mirrors the Qwen3 MLP pattern: a pl.range() loop containing - matmul sits between two pl.parallel() chunk loops. 
- """ - - @pl.program - class Before: - @pl.function - def main( - self, - x: pl.Tensor[[8, 64], pl.FP32], - w: pl.Tensor[[64, 64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.range(0, 8, 4): - for i in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.adds(x, 1.0) - for k in pl.range(2): - x = pl.tensor.matmul(x, w) - for j in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.adds(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_0(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for i1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_1( - self, - x0: pl.Tensor[[8, 64], pl.FP32], - w0: pl.Tensor[[64, 64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - for k0, (x1,) in pl.range(2, init_values=(x0,)): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.matmul( - x1, w0, a_trans=False, b_trans=False, c_matrix_nz=False - ) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_2(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for j1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.Orchestration) - def main( - self, - x0: pl.Tensor[[8, 64], pl.FP32], - w0: pl.Tensor[[64, 64], pl.FP32], - ) -> 
pl.Tensor[[8, 64], pl.FP32]: - for b0, (x1,) in pl.range(0, 8, 4, init_values=(x0,)): - for i0, (x2,) in pl.parallel( - 2, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x3: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_0(x2) - x4: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_1(x4, w0) - for j0, (x6,) in pl.parallel( - 2, init_values=(x5,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x7: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_2(x6) - x8: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x7) - x9: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x8) - return x9 - - After = _run_pipeline(Before) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_all_ops_outlined_end_to_end(self): - """End-to-end: all compute ops inside auto_incore must be outlined. - - Same structure as ``test_interleaved_scalar_op_gets_incore`` — this - test is retained as a stronger end-to-end check (same expected output). 
- """ - - @pl.program - class Before: - @pl.function - def main( - self, - x: pl.Tensor[[8, 64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.range(0, 8, 4): - for i in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.adds(x, 1.0) - y: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(x, 2.0) - for j in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.add(x, y) - return x - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_0(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for i1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_1(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - y0: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(x0, 2.0) - return y0 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_2( - self, - x0: pl.Tensor[[8, 64], pl.FP32], - y0: pl.Tensor[[8, 64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - for j1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.add(x1, y0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for b0, (x1,) in pl.range(0, 8, 4, init_values=(x0,)): - for i0, (x2,) in pl.parallel( - 2, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x3: pl.Tensor[[8, 64], pl.FP32] = 
self.main_incore_0(x2) - x4: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x3) - y0: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_1(x4) - for j0, (x5,) in pl.parallel( - 2, init_values=(x4,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x6: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_2(x5, y0) - x7: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x6) - x8: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x7) - return x8 - - After = _run_pipeline(Before) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestNestedForStmtRecursion: - """The fix recurses into ForStmt bodies that contain InCore scopes. - These tests verify the recursion works for deeper nesting and edge cases.""" - - def test_doubly_nested_range_with_interleaved_op(self): - """Non-parallel op inside a doubly nested range loop must get InCore scope.""" - - @pl.program - class Before: - @pl.function - def main( - self, - x: pl.Tensor[[8, 64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.range(0, 8, 4): - for c in pl.range(2): - for i in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.adds(x, 1.0) - y: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(x, 3.0) - for j in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.add(x, y) - return x - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_0(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for i1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_1(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - y0: 
pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(x0, 3.0) - return y0 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_2( - self, - x0: pl.Tensor[[8, 64], pl.FP32], - y0: pl.Tensor[[8, 64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - for j1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.add(x1, y0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for b0, (x1,) in pl.range(0, 8, 4, init_values=(x0,)): - for c0, (x2,) in pl.range(2, init_values=(x1,)): - for i0, (x3,) in pl.parallel( - 2, init_values=(x2,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x4: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_0(x3) - x5: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x4) - y0: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_1(x5) - for j0, (x6,) in pl.parallel( - 2, init_values=(x5,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x7: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_2(x6, y0) - x8: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x7) - x9: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x8) - x10: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x9) - return x10 - - After = _run_pipeline(Before) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_single_forstmt_body_with_mixed_children(self): - """auto_incore body is a single ForStmt (not SeqStmts). - - This is the exact trigger for the original bug: ContainsInCoreScope - returns True for the ForStmt, so the old code returned it as-is - without examining its children. 
- """ - - @pl.program - class Before: - @pl.function - def main( - self, - x: pl.Tensor[[8, 64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.range(0, 8, 4): - for i in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.adds(x, 1.0) - x = pl.tensor.muls(x, 2.0) - return x - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_0(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for i1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_1(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - x1: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(x0, 2.0) - return x1 - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for b0, (x1,) in pl.range(0, 8, 4, init_values=(x0,)): - for i0, (x2,) in pl.parallel( - 2, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x3: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_0(x2) - x4: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_1(x4) - x6: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x5) - return x6 - - After = _run_pipeline(Before) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_multiple_non_parallel_ops_between_chunks(self): - """Multiple consecutive non-parallel ops between chunks must all be wrapped.""" - - @pl.program - class Before: - @pl.function - def main( - self, - x: pl.Tensor[[8, 64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - with 
pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.range(0, 8, 4): - for i in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.adds(x, 1.0) - y: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(x, 2.0) - z: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.add(x, y) - x = pl.tensor.muls(z, 0.5) - for j in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.adds(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_0(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for i1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_1(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - y0: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(x0, 2.0) - z0: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.add(x0, y0) - x1: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(z0, 0.5) - return x1 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_2(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for j1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for b0, (x1,) in pl.range(0, 8, 4, init_values=(x0,)): - for i0, (x2,) in pl.parallel( - 2, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x3: pl.Tensor[[8, 64], pl.FP32] = 
self.main_incore_0(x2) - x4: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_1(x4) - for j0, (x6,) in pl.parallel( - 2, init_values=(x5,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x7: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_2(x6) - x8: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x7) - x9: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x8) - return x9 - - After = _run_pipeline(Before) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_no_parallel_chunks_no_wrapping(self): - """auto_incore with only non-parallel code (no chunks) should not crash. - - When there are no interchanged parallel chunks, there are no InCore - scopes to trigger recursion. The function should still work correctly — - the whole body becomes a single InCore function. - """ - - @pl.program - class Before: - @pl.function - def main( - self, - x: pl.Tensor[[8, 64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.range(0, 8, 4): - x = pl.tensor.adds(x, 1.0) - x = pl.tensor.muls(x, 2.0) - return x - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_0(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for b0, (x1,) in pl.range(0, 8, 4, init_values=(x0,)): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(x2, 2.0) - x4: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x3) - return x4 - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - x1: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_0(x0) - return x1 - - After = _run_pipeline(Before) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestHostSideTailOps: - """Host-side tensor ops may stay in 
Orchestration after outline.""" - - def test_tail_assemble_after_parallel_chunk_stays_in_orchestration(self): - """A trailing tensor.assemble should remain in the Orchestration function.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[4], pl.FP32]) -> pl.Tensor[[8], pl.FP32]: - out_0: pl.Tensor[[8], pl.FP32] = pl.tensor.create( - [8], dtype=pl.FP32, layout=pl.TensorLayout.ND - ) - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 4, 1, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.adds(x, 1.0) - out_1: pl.Tensor[[8], pl.FP32] = pl.tensor.assemble(out_0, x, [0]) - return out_1 - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_0(self, x0: pl.Tensor[[4], pl.FP32]) -> pl.Tensor[[4], pl.FP32]: - for i1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[4], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[4], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x0: pl.Tensor[[4], pl.FP32]) -> pl.Tensor[[8], pl.FP32]: - out_0: pl.Tensor[[8], pl.FP32] = pl.tensor.create( - [8], dtype=pl.FP32, layout=pl.TensorLayout.ND - ) - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x2: pl.Tensor[[4], pl.FP32] = self.main_incore_0(x1) - x3: pl.Tensor[[4], pl.FP32] = pl.yield_(x2) - out_1: pl.Tensor[[8], pl.FP32] = pl.tensor.assemble(out_0, x3, [0]) - return out_1 - - After = _run_pipeline(Before) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/tests/ut/ir/transforms/test_outline_incore_scopes.py b/tests/ut/ir/transforms/test_outline_incore_scopes.py index a8015421b..a000f9e1f 100644 --- 
a/tests/ut/ir/transforms/test_outline_incore_scopes.py +++ b/tests/ut/ir/transforms/test_outline_incore_scopes.py @@ -7,21 +7,24 @@ # See LICENSE in the root of the software repository for the full text of the License. # ----------------------------------------------------------------------------------------------------------- -"""Unit tests for OutlineIncoreScopes pass.""" +"""Unit tests for OutlineIncoreScopes pass. -import re +OutlineIncoreScopes outlines `HierarchyScopeStmt(level=CORE_GROUP)` into +`Function(InCore)` and promotes the parent function from `Opaque` to +`Orchestration`. It runs after OutlineHierarchyScopes, which handles all +non-CORE_GROUP Hierarchy scopes. +""" import pypto.language as pl import pytest from pypto import ir, passes -from pypto.ir.printer import python_print class TestOutlineIncoreScopes: """Test OutlineIncoreScopes pass.""" - def test_outline_simple_incore_scope(self): - """Test outlining a simple InCore scope.""" + def test_outline_simple_core_group_scope(self): + """A single CORE_GROUP scope becomes an InCore function; main is promoted to Orchestration.""" @pl.program class Before: @@ -31,782 +34,156 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) return y - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def main_incore_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return y - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = self.main_incore_0(x) - return y - - # Convert to SSA first (required by outline pass) Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) - - # Apply outline pass After = passes.outline_incore_scopes()(Before) - # Should be structurally equal - ir.assert_structural_equal(After, Expected) - - def 
test_outline_multiple_incore_scopes(self): - """Test outlining multiple InCore scopes in one function.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - with pl.at(level=pl.Level.CORE_GROUP): - z: pl.Tensor[[64], pl.FP32] = pl.mul(y, y) - return z - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def main_incore_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return y - - @pl.function(type=pl.FunctionType.InCore) - def main_incore_1(self, y: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - z: pl.Tensor[[64], pl.FP32] = pl.mul(y, y) - return z - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = self.main_incore_0(x) - z: pl.Tensor[[64], pl.FP32] = self.main_incore_1(y) - return z - - # Convert to SSA first - Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) - - # Apply outline pass - After = passes.outline_incore_scopes()(Before) + func_types = {gv.name: func.func_type for gv, func in After.functions.items()} + # Parent promoted + assert func_types["main"] == ir.FunctionType.Orchestration + # Exactly one outlined InCore function with "core_group" in its name + incore_funcs = [(n, t) for n, t in func_types.items() if t == ir.FunctionType.InCore] + assert len(incore_funcs) == 1 + assert "core_group" in incore_funcs[0][0] - # Should be structurally equal - ir.assert_structural_equal(After, Expected) + def test_outline_preserves_non_core_group_scopes(self): + """Non-CORE_GROUP Hierarchy scopes are left intact for OutlineHierarchyScopes. 
- def test_outline_preserves_non_incore_functions(self): - """Test that non-InCore functions are preserved unchanged.""" + Run with verification disabled because OutlineIncoreScopes claims to + produce HierarchyOutlined; a leftover HOST scope (which would normally + have been removed by OutlineHierarchyScopes earlier in the pipeline) + intentionally fails that property — we only care that the pass itself + is a no-op for non-CORE_GROUP scopes. + """ @pl.program class Before: @pl.function - def helper(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - result: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return result - - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.mul(x, x) - return y - - @pl.program - class Expected: - @pl.function - def helper(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - result: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return result - - @pl.function(type=pl.FunctionType.InCore) - def main_incore_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = pl.mul(x, x) - return y - - @pl.function(type=pl.FunctionType.Orchestration) def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = self.main_incore_0(x) + with pl.at(level=pl.Level.HOST, role=pl.Role.Worker): + y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) return y - # Convert to SSA first - Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) - - # Apply outline pass - After = passes.outline_incore_scopes()(Before) + with passes.PassContext([], passes.VerificationLevel.NONE): + Before = passes.convert_to_ssa()(Before) + After = passes.outline_incore_scopes()(Before) + # Pass is a no-op — no CORE_GROUP scope present. 
+ ir.assert_structural_equal(After, Before) - # Should be structurally equal - ir.assert_structural_equal(After, Expected) - - def test_outline_scope_with_multiple_inputs(self): - """Test outlining scope that uses multiple outer variables.""" - - @pl.program - class Before: - @pl.function - def main( - self, x: pl.Tensor[[64], pl.FP32], y: pl.Tensor[[64], pl.FP32] - ) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = pl.add(x, y) - b: pl.Tensor[[64], pl.FP32] = pl.mul(x, y) - with pl.at(level=pl.Level.CORE_GROUP): - result: pl.Tensor[[64], pl.FP32] = pl.add(a, b) - return result - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def main_incore_0( - self, a: pl.Tensor[[64], pl.FP32], b: pl.Tensor[[64], pl.FP32] - ) -> pl.Tensor[[64], pl.FP32]: - result: pl.Tensor[[64], pl.FP32] = pl.add(a, b) - return result - - @pl.function(type=pl.FunctionType.Orchestration) - def main( - self, x: pl.Tensor[[64], pl.FP32], y: pl.Tensor[[64], pl.FP32] - ) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = pl.add(x, y) - b: pl.Tensor[[64], pl.FP32] = pl.mul(x, y) - result: pl.Tensor[[64], pl.FP32] = self.main_incore_0(a, b) - return result - - Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) - After = passes.outline_incore_scopes()(Before) - ir.assert_structural_equal(After, Expected) - - def test_outline_scope_with_multiple_outputs(self): - """Test outlining scope that produces multiple values. - - The Before/After pattern can't express TupleGetItem in the DSL, - so we verify properties directly. 
- """ + def test_outline_split_propagates_to_incore_function(self): + """`pl.split(...)` on a CORE_GROUP scope is forwarded to the outlined InCore fn.""" @pl.program class Before: @pl.function def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP): + with pl.at( + level=pl.Level.CORE_GROUP, + optimizations=[pl.split(pl.SplitMode.UP_DOWN)], + ): y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - z: pl.Tensor[[64], pl.FP32] = pl.mul(x, x) - result: pl.Tensor[[64], pl.FP32] = pl.add(y, z) - return result - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def main_incore_0( - self, x: pl.Tensor[[64], pl.FP32] - ) -> tuple[pl.Tensor[[64], pl.FP32], pl.Tensor[[64], pl.FP32]]: - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - z: pl.Tensor[[64], pl.FP32] = pl.mul(x, x) - return (y, z) - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - ret = self.main_incore_0(x) - y = ret[0] - z = ret[1] - result: pl.Tensor[[64], pl.FP32] = pl.add(y, z) - return result + return y Before = passes.convert_to_ssa()(Before) After = passes.outline_incore_scopes()(Before) + incore_funcs = [f for _, f in After.functions.items() if f.func_type == ir.FunctionType.InCore] + assert len(incore_funcs) == 1 + # `split` attr round-trips as the SplitMode's underlying int value. 
+ attrs = dict(incore_funcs[0].attrs) + assert attrs.get("split") == ir.SplitMode.UP_DOWN.value - ir.assert_structural_equal(After, Expected) - - def test_nested_incore_scopes_rejected_by_verifier(self): - """Nested InCore scopes are rejected by the NoNestedInCore structural verifier.""" + def test_pipeline_order_outlines_nested_core_group(self): + """Hierarchy then Incore outlining cleanly handles a CORE_GROUP nested inside HOST.""" @pl.program class Before: @pl.function def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) + with pl.at(level=pl.Level.HOST, role=pl.Role.Worker): with pl.at(level=pl.Level.CORE_GROUP): - z: pl.Tensor[[64], pl.FP32] = pl.mul(y, y) - return z - - # Verify directly (no pass pipeline) — nested InCore is a structural invariant violation - props = passes.IRPropertySet() - props.insert(passes.IRProperty.NoNestedInCore) - diagnostics = passes.PropertyVerifierRegistry.verify(props, Before) - errors = [d for d in diagnostics if d.severity == passes.DiagnosticSeverity.Error] - assert len(errors) >= 1 - assert "Nested InCore scope" in errors[0].message - - def test_outline_scope_with_single_input_single_output(self): - """Test outlining scope with simple single input/output.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.mul(a, a) - result: pl.Tensor[[64], pl.FP32] = pl.add(y, y) - return result - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def main_incore_0(self, a: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = pl.mul(a, a) + y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) return y - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x: pl.Tensor[[64], pl.FP32]) 
-> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - y: pl.Tensor[[64], pl.FP32] = self.main_incore_0(a) - result: pl.Tensor[[64], pl.FP32] = pl.add(y, y) - return result + program = passes.convert_to_ssa()(Before) + program = passes.outline_hierarchy_scopes()(program) + program = passes.outline_incore_scopes()(program) - Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) - After = passes.outline_incore_scopes()(Before) - ir.assert_structural_equal(After, Expected) + func_types = {gv.name: func.func_type for gv, func in program.functions.items()} + # The inner HOST function (which originally wrapped the CORE_GROUP scope) + # must have been promoted to Orchestration when its CORE_GROUP child got + # outlined. Distinguish it from the further-outlined CORE_GROUP function + # (whose name extends `main_host_worker_…` with `_core_group_…`) by + # filtering out names that *also* contain `core_group`. + host_only_funcs = [n for n in func_types if "host_worker" in n and "core_group" not in n] + assert len(host_only_funcs) == 1 + assert func_types[host_only_funcs[0]] == ir.FunctionType.Orchestration + # An InCore function exists. + assert any(t == ir.FunctionType.InCore for t in func_types.values()) + # main itself (which only contained the HOST scope, not a CORE_GROUP + # directly) stays Opaque. 
+ assert func_types["main"] == ir.FunctionType.Opaque - def test_outline_multiple_functions_with_scopes(self): - """Test outlining scopes in multiple functions (independent numbering).""" + def test_no_core_group_passthrough(self): + """Functions without CORE_GROUP scopes pass through unchanged.""" @pl.program class Before: @pl.function - def func1(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return y - - @pl.function - def func2(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.mul(x, x) - return y - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def func1_incore_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) return y - @pl.function(type=pl.FunctionType.Orchestration) - def func1(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = self.func1_incore_0(x) - return y - - @pl.function(type=pl.FunctionType.InCore) - def func2_incore_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = pl.mul(x, x) - return y - - @pl.function(type=pl.FunctionType.Orchestration) - def func2(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = self.func2_incore_0(x) - return y - Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) After = passes.outline_incore_scopes()(Before) - ir.assert_structural_equal(After, Expected) + ir.assert_structural_equal(After, Before) - def test_outline_scope_in_control_flow(self): - """Test outlining scope inside conditional statement.""" + def test_outline_skips_non_opaque_functions(self): + """Already-typed (InCore/Orchestration/...) 
functions are not touched.""" @pl.program class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32], cond: pl.Scalar[pl.BOOL]) -> pl.Tensor[[64], pl.FP32]: - if cond: - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) # type: ignore[no-redef] - else: - y: pl.Tensor[[64], pl.FP32] = pl.mul(x, x) # type: ignore[no-redef,unreachable] - return y - - @pl.program - class Expected: @pl.function(type=pl.FunctionType.InCore) - def main_incore_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + def compute(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) return y - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x: pl.Tensor[[64], pl.FP32], cond: pl.Scalar[pl.BOOL]) -> pl.Tensor[[64], pl.FP32]: - if cond: - y: pl.Tensor[[64], pl.FP32] = self.main_incore_0(x) # type: ignore[no-redef] - else: - y: pl.Tensor[[64], pl.FP32] = pl.mul(x, x) # type: ignore[no-redef,unreachable] - return y - - Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) - After = passes.outline_incore_scopes()(Before) - ir.assert_structural_equal(After, Expected) - - def test_outline_incore_with_if_yield(self): - """Test outline_incore_scopes with IfStmt containing unannotated yields (issue #233).""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32], cond: pl.Scalar[pl.BOOL]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP): - if cond: - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - z = pl.yield_(y) # Unannotated - should infer type - else: - y2: pl.Tensor[[64], pl.FP32] = pl.mul(x, x) - z = pl.yield_(y2) - return z - - Before = passes.convert_to_ssa()(Before) - After = passes.outline_incore_scopes()(Before) - - printed = After.as_python() - # The outlined incore function should have correct return type, not Tensor[[1], INT32] - assert "Tensor[[1], pl.INT32]" not in 
printed - assert "Tensor[[64], pl.FP32]" in printed - - def test_outline_scope_with_intermediate_computation(self): - """Test outlining scope with computation before, inside, and after.""" - - @pl.program - class Before: @pl.function def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - b: pl.Tensor[[64], pl.FP32] = pl.mul(a, a) - with pl.at(level=pl.Level.CORE_GROUP): - c: pl.Tensor[[64], pl.FP32] = pl.add(b, b) - d: pl.Tensor[[64], pl.FP32] = pl.mul(c, c) - e: pl.Tensor[[64], pl.FP32] = pl.add(d, d) - return e - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def main_incore_0(self, b: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - c: pl.Tensor[[64], pl.FP32] = pl.add(b, b) - d: pl.Tensor[[64], pl.FP32] = pl.mul(c, c) - return d - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - b: pl.Tensor[[64], pl.FP32] = pl.mul(a, a) - d: pl.Tensor[[64], pl.FP32] = self.main_incore_0(b) - e: pl.Tensor[[64], pl.FP32] = pl.add(d, d) - return e - - Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) - After = passes.outline_incore_scopes()(Before) - ir.assert_structural_equal(After, Expected) - - def test_outline_scope_with_store_only_outputs(self): - """Test outlining scope where the only outputs are store targets. - - When an InCore scope only writes to external tensors via tile.store - (no new variable definitions used after the scope), the store targets - must be recognised as outputs and returned. 
- """ - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[16, 128], pl.FP32]) -> pl.Tensor[[16, 128], pl.FP32]: - buf: pl.Tensor[[16, 128], pl.FP32] = pl.create_tensor([16, 128], dtype=pl.FP32) with pl.at(level=pl.Level.CORE_GROUP): - tile = pl.tile.full([16, 128], dtype=pl.FP32, value=0.0) - pl.store(tile, [0, 0], buf) - result: pl.Tensor[[16, 128], pl.FP32] = pl.add(buf, x) - return result - - Before = passes.convert_to_ssa()(Before) - After = passes.outline_incore_scopes()(Before) - - printed = After.as_python() - # The outlined InCore function should return buf (store target) - assert "return buf" in printed or "return buf_0" in printed - # The orchestration should receive the return value - assert "main_incore_0(" in printed - - def test_outline_scope_with_multiple_store_targets(self): - """Test outlining scope with multiple store targets as outputs. - - Multiple external tensors modified via tile.store should all appear - as return values of the outlined function. 
- """ - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[16, 128], pl.FP32]) -> pl.Tensor[[16, 128], pl.FP32]: - buf_a: pl.Tensor[[16, 128], pl.FP32] = pl.create_tensor([16, 128], dtype=pl.FP32) - buf_b: pl.Tensor[[16, 1], pl.FP32] = pl.create_tensor([16, 1], dtype=pl.FP32) - with pl.at(level=pl.Level.CORE_GROUP): - tile_a = pl.tile.full([16, 128], dtype=pl.FP32, value=0.0) - tile_b = pl.tile.full([16, 1], dtype=pl.FP32, value=0.0) - pl.store(tile_a, [0, 0], buf_a) - pl.store(tile_b, [0, 0], buf_b) - result: pl.Tensor[[16, 128], pl.FP32] = pl.add(buf_a, x) - return result + y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) + return y Before = passes.convert_to_ssa()(Before) After = passes.outline_incore_scopes()(Before) - printed = After.as_python() - # Both store targets should appear as outputs - assert "main_incore_0(" in printed - # The InCore function should have return statement - assert ( - "return" in printed.split("@pl.function(type=pl.FunctionType.InCore)")[1].split("@pl.function")[0] - ) + compute = After.get_function("compute") + assert compute is not None + assert compute.func_type == ir.FunctionType.InCore - def test_outline_scope_with_loop_carried_init_values(self): - """Test outlining scope where inner loop references outer loop-carried variable via init_values. + main = After.get_function("main") + assert main is not None + # main got promoted because its CORE_GROUP scope was outlined. + assert main.func_type == ir.FunctionType.Orchestration - Regression test for issue #369: OutlineIncoreScopes failed to include - outer loop-carried variables as incore function parameters when they - appeared only inside IterArg.initValue_ expressions. 
- """ + def test_multiple_core_group_scopes_in_one_function(self): + """Two sibling CORE_GROUP scopes both get outlined; parent promoted once.""" @pl.program class Before: - @pl.function - def main( - self, x: pl.Tensor[[64], pl.FP32], y: pl.Tensor[[64], pl.FP32] - ) -> pl.Tensor[[64], pl.FP32]: - for i, (acc,) in pl.range(3, init_values=(x,)): - with pl.at(level=pl.Level.CORE_GROUP): - for j, (inner,) in pl.range(2, init_values=(acc,)): - updated: pl.Tensor[[64], pl.FP32] = pl.add(inner, y) - inner_rv = pl.yield_(updated) - acc_rv = pl.yield_(inner_rv) - return acc_rv - - Before = passes.convert_to_ssa()(Before) - After = passes.outline_incore_scopes()(Before) - - printed = After.as_python() - incore_section = printed.split("@pl.function(type=pl.FunctionType.InCore)")[1].split("@pl.function")[ - 0 - ] - # Extract parameters between "def ...(self, ...)" — handle multiline signatures - param_match = re.search(r"def \w+\((.*?)\)\s*->", incore_section, re.DOTALL) - assert param_match is not None - incore_params = param_match.group(1) - orch_section = printed.split("@pl.function(type=pl.FunctionType.Orchestration)")[1] - - assert "acc" in incore_params, ( - "outer loop-carried variable 'acc' must be a parameter of the outlined function" - ) - assert "main_incore_0" in orch_section and "acc" in orch_section, ( - "orchestration must pass 'acc' to the outlined function" - ) - - def test_outline_scope_does_not_capture_outer_init_value(self): - """Outer loop's init value must NOT become a parameter of the outlined incore function. - - When an incore scope uses a loop-carried variable (IterArg) from an - outer ForStmt, only the IterArg itself should be captured as a - parameter, not its initValue_ expression. 
- """ - - @pl.program - class Before: - @pl.function - def main( - self, init: pl.Tensor[[64], pl.FP32], y: pl.Tensor[[64], pl.FP32] - ) -> pl.Tensor[[64], pl.FP32]: - for sb, (acc,) in pl.range(4, init_values=(init,)): - with pl.at(level=pl.Level.CORE_GROUP): - result: pl.Tensor[[64], pl.FP32] = pl.add(acc, y) - acc_rv = pl.yield_(result) - return acc_rv - - Before = passes.convert_to_ssa()(Before) - After = passes.outline_incore_scopes()(Before) - - printed = After.as_python() - incore_section = printed.split("@pl.function(type=pl.FunctionType.InCore)")[1].split("@pl.function")[ - 0 - ] - # Extract parameters — handle multiline signatures from ruff formatting - param_match = re.search(r"def \w+\((.*?)\)\s*->", incore_section, re.DOTALL) - assert param_match is not None - incore_params = param_match.group(1) - - assert "acc" in incore_params, "loop-carried 'acc' must be a parameter" - assert "init" not in incore_params, ( - "outer loop's init value 'init' must NOT be a parameter of the incore function" - ) - - -class TestSplitIncoreOrchVerifier: - """Regression tests for the SplitIncoreOrch property verifier.""" - - def _build_outlined_program(self, input_program): - """Run convert_to_ssa + outline_incore_scopes (no verification).""" - ctx = passes.PassContext([], passes.VerificationLevel.NONE) - with ctx: - program = passes.convert_to_ssa()(input_program) - program = passes.outline_incore_scopes()(program) - return program - - @staticmethod - def _split_incore_orch_props(): - ps = passes.IRPropertySet() - ps.insert(passes.IRProperty.SplitIncoreOrch) - return ps - - def test_clean_orchestration_passes_verification(self): - """Outlined program with all compute in InCore passes property verification.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return y - - After = self._build_outlined_program(Input) - 
# Should not throw — no InCore scopes remain, no errors - passes.verify_properties(self._split_incore_orch_props(), After, "test") - - def test_remaining_incore_scope_fails_verification(self): - """Leftover InCore ScopeStmt in non-InCore function causes verification failure.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return y - - # Don't outline — just convert to SSA, leaving InCore scope intact - ctx = passes.PassContext([], passes.VerificationLevel.NONE) - with ctx: - program = passes.convert_to_ssa()(Input) - - # verify_properties should throw because InCore scope remains in Opaque function - with pytest.raises(Exception, match="InCore ScopeStmt"): - passes.verify_properties(self._split_incore_orch_props(), program, "test") - - def test_compute_op_in_orchestration_does_not_fail(self): - """Compute tensor op in Orchestration produces warning (not error), verification passes.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.mul(a, a) - return y - - After = self._build_outlined_program(Input) - # Orchestration has tensor.add — but it's a warning, not an error - # verify_properties should NOT throw - passes.verify_properties(self._split_incore_orch_props(), After, "test") - - def test_outline_does_not_throw_for_clean_program(self): - """Running outline_incore_scopes on a clean program does not throw.""" - - @pl.program - class Input: @pl.function def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: with pl.at(level=pl.Level.CORE_GROUP): y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return y - - # Run with full verification enabled — should not throw - program = 
passes.convert_to_ssa()(Input) - passes.outline_incore_scopes()(program) - - def test_outline_with_compute_outside_incore_verification_passes(self): - """Compute ops outside incore in explicit pl.incore() usage: verification passes (warning only).""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = pl.add(x, x) with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.mul(a, a) - result: pl.Tensor[[64], pl.FP32] = pl.add(y, y) - return result - - # Run with full verification — should pass despite compute ops in orchestration - program = passes.convert_to_ssa()(Input) - After = passes.outline_incore_scopes()(program) - - # Verify the outlined program still has the expected structure - orch_funcs = [f for f in After.functions.values() if f.func_type == ir.FunctionType.Orchestration] - incore_funcs = [f for f in After.functions.values() if f.func_type == ir.FunctionType.InCore] - assert len(orch_funcs) == 1 - assert len(incore_funcs) == 1 - - def test_full_pipeline_with_verification_passes(self): - """Full pipeline with auto_incore: no compute ops leak into Orchestration.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - x = pl.add(x, 1.0) - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 2.0) - return x - - # Run the full pipeline with verification enabled — should not throw - program = passes.unroll_loops()(Input) - program = passes.convert_to_ssa()(program) - program = passes.flatten_call_expr()(program) - program = passes.split_chunked_loops()(program) - program = passes.interchange_chunk_loops()(program) - program = passes.outline_incore_scopes()(program) - - # Verify no compute tensor ops in orchestration - for func in program.functions.values(): - if 
func.func_type == ir.FunctionType.Orchestration: - func_str = python_print(func) - assert "tensor.add" not in func_str - - -class TestOutlineNamedIncoreScopes: - """Test OutlineIncoreScopes pass with user-provided scope names.""" - - def test_outline_named_incore_scope(self): - """Test that user-provided name is used for the outlined function.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, name_hint="fused_add"): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return y - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def fused_add(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return y - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = self.fused_add(x) - return y - - Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) - After = passes.outline_incore_scopes()(Before) - ir.assert_structural_equal(After, Expected) - - def test_outline_mixed_named_and_unnamed_scopes(self): - """Test that unnamed scopes still get auto-generated names when mixed with named scopes.""" - - @pl.program - class Before: - @pl.function - def main( - self, - x: pl.Tensor[[64], pl.FP32], - y: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, name_hint="first_kernel"): - a: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - with pl.at(level=pl.Level.CORE_GROUP): - b: pl.Tensor[[64], pl.FP32] = pl.add(y, a) - return b - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def first_kernel(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return a - - @pl.function(type=pl.FunctionType.InCore) - def main_incore_1( - self, - 
y: pl.Tensor[[64], pl.FP32], - a: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - b: pl.Tensor[[64], pl.FP32] = pl.add(y, a) - return b - - @pl.function(type=pl.FunctionType.Orchestration) - def main( - self, - x: pl.Tensor[[64], pl.FP32], - y: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = self.first_kernel(x) - b: pl.Tensor[[64], pl.FP32] = self.main_incore_1(y, a) - return b + z: pl.Tensor[[64], pl.FP32] = pl.mul(y, y) + return z Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) After = passes.outline_incore_scopes()(Before) - ir.assert_structural_equal(After, Expected) - def test_outline_duplicate_name_hint_auto_dedup(self): - """Test that duplicate name_hints are auto-deduplicated.""" - - @pl.program - class Before: - @pl.function - def main( - self, - x: pl.Tensor[[64], pl.FP32], - y: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, name_hint="my_kernel"): - a: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - with pl.at(level=pl.Level.CORE_GROUP, name_hint="my_kernel"): - b: pl.Tensor[[64], pl.FP32] = pl.add(y, a) - return b - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def my_kernel(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return a - - @pl.function(type=pl.FunctionType.InCore) - def my_kernel_0( - self, - y: pl.Tensor[[64], pl.FP32], - a: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - b: pl.Tensor[[64], pl.FP32] = pl.add(y, a) - return b - - @pl.function(type=pl.FunctionType.Orchestration) - def main( - self, - x: pl.Tensor[[64], pl.FP32], - y: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = self.my_kernel(x) - b: pl.Tensor[[64], pl.FP32] = self.my_kernel_0(y, a) - return b - - Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) 
- After = passes.outline_incore_scopes()(Before) - ir.assert_structural_equal(After, Expected) + func_types = {gv.name: func.func_type for gv, func in After.functions.items()} + assert func_types["main"] == ir.FunctionType.Orchestration + incore_count = sum(1 for t in func_types.values() if t == ir.FunctionType.InCore) + assert incore_count == 2 if __name__ == "__main__": diff --git a/tests/ut/ir/transforms/test_pass_manager.py b/tests/ut/ir/transforms/test_pass_manager.py index 4065083b0..a04af594d 100644 --- a/tests/ut/ir/transforms/test_pass_manager.py +++ b/tests/ut/ir/transforms/test_pass_manager.py @@ -17,8 +17,6 @@ from pypto.backend import BackendType TENSOR_ONLY_PASSES = [ - "SplitChunkedLoops", - "InterchangeChunkLoops", "OutlineHierarchyScopes", "OutlineIncoreScopes", "OutlineClusterScopes", diff --git a/tests/ut/ir/transforms/test_pass_pipeline.py b/tests/ut/ir/transforms/test_pass_pipeline.py index cd73b407c..da7c23572 100644 --- a/tests/ut/ir/transforms/test_pass_pipeline.py +++ b/tests/ut/ir/transforms/test_pass_pipeline.py @@ -161,22 +161,22 @@ def test_before_mode_catches_false_ssa_claim(self): # Same Var assigned twice — genuine SSA violation program = _make_ssa_violating_program() with pytest.raises(Exception, match="Pre-verification failed"): - passes.outline_incore_scopes()(program) + passes.outline_hierarchy_scopes()(program) def test_before_mode_succeeds_when_property_holds(self): """BEFORE mode passes when the required property actually holds.""" with passes.PassContext([passes.VerificationInstrument(passes.VerificationMode.BEFORE)]): program = _make_non_ssa_program() program = passes.convert_to_ssa()(program) - result = passes.outline_incore_scopes()(program) + result = passes.outline_hierarchy_scopes()(program) assert result is not None def test_empty_context_disables_verification(self): """Empty instrument list overrides conftest's verification context.""" with passes.PassContext([]): - # OutlineIncoreScopes requires SSAForm, but empty 
context = no check + # OutlineHierarchyScopes requires SSAForm, but empty context = no check program = _make_non_ssa_program() - result = passes.outline_incore_scopes()(program) + result = passes.outline_hierarchy_scopes()(program) assert result is not None def test_before_and_after_succeeds_on_valid_pipeline(self): @@ -193,7 +193,7 @@ def test_before_and_after_catches_pre_violation(self): # Same Var assigned twice — genuine SSA violation program = _make_ssa_violating_program() with pytest.raises(Exception, match="Pre-verification failed"): - passes.outline_incore_scopes()(program) + passes.outline_hierarchy_scopes()(program) def test_pipeline_with_context(self): """PassPipeline respects active PassContext instruments.""" diff --git a/tests/ut/ir/transforms/test_split_chunked_loops.py b/tests/ut/ir/transforms/test_split_chunked_loops.py deleted file mode 100644 index a4a0e9d9f..000000000 --- a/tests/ut/ir/transforms/test_split_chunked_loops.py +++ /dev/null @@ -1,1423 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
-# ----------------------------------------------------------------------------------------------------------- - -"""Unit tests for SplitChunkedLoops pass.""" - -import re -from typing import cast - -import pypto.language as pl -import pytest -from pypto import ir, passes -from pypto.ir.printer import python_print - - -def _prepare_for_split(program): - """Run prerequisite passes to produce SSA input for SplitChunkedLoops.""" - program = passes.unroll_loops()(program) - program = passes.convert_to_ssa()(program) - program = passes.flatten_call_expr()(program) - return program - - -def _top_level_stmts(program: ir.Program) -> list[ir.Stmt]: - """Return the first function's top-level statements.""" - func = list(program.functions.values())[0] - return list(cast(ir.SeqStmts, func.body).stmts) - - -def _body_stmts(stmt: ir.Stmt) -> list[ir.Stmt]: - """Return child statements from a SeqStmts body.""" - return list(cast(ir.SeqStmts, stmt).stmts) - - -def _normalize_expected(program): - """Normalize Expected IR structure to match pass pipeline output. - - The DSL-constructed Expected programs have a different statement nesting - than the pass pipeline output. This applies the same structural - normalization so assert_structural_equal can compare them. 
- """ - return passes.normalize_stmt_structure()(program) - - -class TestBasicChunking: - """Tests for basic loop chunking with SSA iter_args propagation.""" - - def test_divisible_chunk(self): - """Chunk a loop where trip_count is divisible by chunk_size.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 10, 1, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_0_out, (x_iter_1_outer,) in pl.range( - 0, 2, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_0_in, (x_iter_1_inner,) in pl.range( - 0, - 5, - 1, - init_values=(x_iter_1_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_iter_1_inner, 1.0) - x_iter_1_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - x_iter_1_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter_1_inner_rv) - return x_iter_1_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_non_divisible_chunk(self): - """Chunk a loop where trip_count is NOT divisible by chunk_size.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 7, 1, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - @pl.program - class Expected: - 
@pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_0_out, (x_iter_1_outer,) in pl.range( - 0, 1, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): # noqa: E501 - for i_0_in, (x_iter_1_inner,) in pl.range( - 0, - 5, - 1, - init_values=(x_iter_1_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): # noqa: E501 - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_iter_1_inner, 1.0) - x_iter_1_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - x_iter_1_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter_1_inner_rv) - for i_0_rem, (x_iter_1_rem,) in pl.range( - 0, - 2, - 1, - init_values=(x_iter_1_outer_rv,), - attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder}, - ): # noqa: E501 - x_3_f: pl.Tensor[[64], pl.FP32] = pl.add(x_iter_1_rem, 1.0) - x_iter_1_rem_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3_f) - return x_iter_1_rem_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_single_chunk(self): - """Chunk a loop where trip_count equals chunk_size.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 5, 1, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_0_out, (x_iter_1_outer,) in pl.range( - 0, 1, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_0_in, (x_iter_1_inner,) in pl.range( - 0, - 5, - 1, - 
init_values=(x_iter_1_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_iter_1_inner, 1.0) - x_iter_1_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - x_iter_1_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter_1_inner_rv) - return x_iter_1_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - -class TestChunkingWithStep: - """Tests for chunking with non-unit step.""" - - def test_step_2(self): - """Chunk with step=2: range(0, 20, 2, chunk=5) -> trip_count=10.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 20, 2, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_0_out, (x_iter_1_outer,) in pl.range( - 0, 2, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_0_in, (x_iter_1_inner,) in pl.range( - 0, - 5, - 1, - init_values=(x_iter_1_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_iter_1_inner, 1.0) - x_iter_1_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - x_iter_1_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter_1_inner_rv) - return x_iter_1_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_chunk_all_remainder(self): - """Chunk where trip_count < chunk_size -> only remainder loop.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with 
pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 3, 1, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_0_rem, (x_iter_1_rem,) in pl.range( - 0, 3, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder} - ): - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_iter_1_rem, 1.0) - x_iter_1_rem_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - return x_iter_1_rem_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - -class TestChunkingWithKind: - """Tests for chunking with different loop kinds.""" - - def test_parallel_chunk(self): - """Chunk a parallel loop: both inner and outer loops should be Parallel.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_0_out, (x_iter_1_outer,) in pl.parallel( - 0, 2, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_0_in, (x_iter_1_inner,) in pl.parallel( - 0, - 4, - 1, - init_values=(x_iter_1_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_iter_1_inner, 1.0) - 
x_iter_1_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - x_iter_1_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter_1_inner_rv) - return x_iter_1_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - @pytest.mark.filterwarnings("ignore:.*RoundtripInstrument.*IR not printable:UserWarning") - def test_unroll_chunk(self): - """Chunk an unroll loop: both inner and outer loops are Unroll. - - Since the DSL does not support pl.unroll() with init_values, - we verify the IR structure properties directly instead of - using structural equality. - """ - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.unroll(0, 12, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - # Extract the function body - stmts = _top_level_stmts(After) - - # Body should be SeqStmts: [auto_incore_scope, return] - assert len(stmts) == 2 # auto_incore scope + return - - # The first stmt is the AutoInCore scope - scope = cast(ir.ScopeStmt, stmts[0]) - assert scope.scope_kind == ir.ScopeKind.AutoInCore - - # Inside the scope is the outer for loop - outer_for = cast(ir.ForStmt, scope.body) - assert outer_for.kind == ir.ForKind.Unroll - assert len(outer_for.iter_args) == 1 - assert len(outer_for.return_vars) == 1 - - # Outer loop bounds: range(0, 3, 1) — 12/4 = 3 full chunks - assert cast(ir.ConstInt, outer_for.start).value == 0 - assert cast(ir.ConstInt, outer_for.stop).value == 3 - - # Inner loop is inside outer body (SeqStmts: [inner_for, yield]) - outer_body_stmts = _body_stmts(outer_for.body) - inner_for = cast(ir.ForStmt, outer_body_stmts[0]) - assert inner_for.kind == ir.ForKind.Unroll - assert len(inner_for.iter_args) == 1 - assert len(inner_for.return_vars) == 1 - - # Inner loop bounds: 
range(0, 4, 1) - assert cast(ir.ConstInt, inner_for.start).value == 0 - assert cast(ir.ConstInt, inner_for.stop).value == 4 - - -class TestPrinterRoundTrip: - """Tests for printer output with chunk kwargs.""" - - def test_chunk_in_printer(self): - """Verify that chunk kwarg is printed correctly.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 10, 1, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - printed = python_print(Before) - assert "chunk=5" in printed - - def test_parallel_chunk_in_printer(self): - """Verify parallel chunk kwarg is printed.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - printed = python_print(Before) - assert "chunk=4" in printed - assert "pl.parallel" in printed - - -class TestParserErrors: - """Tests for parser validation of chunk arguments.""" - - def test_chunk_with_init_values_allowed(self): - """chunk + init_values should be allowed (not raise parser error).""" - - @pl.program - class Good: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i, (s,) in pl.range(10, init_values=(x,), chunk=5, chunk_policy="leading_full"): - s = pl.add(s, 1.0) - s = pl.yield_(s) - return x - - def test_chunk_zero_error(self): - """chunk=0 should raise parser error.""" - with pytest.raises(Exception, match="chunk must be a positive integer"): - - @pl.program - class Bad: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with 
pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 10, 1, chunk=0, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - def test_chunk_negative_error(self): - """chunk=-1 should raise parser error.""" - with pytest.raises(Exception, match="chunk must be a positive integer"): - - @pl.program - class Bad: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 10, 1, chunk=-1): - x = pl.add(x, 1.0) - return x - - -class TestLoopOrigin: - """Tests for LoopOrigin annotation set by SplitChunkedLoops.""" - - def _get_func_body_stmts(self, program): - """Get the top-level statements from the first function's body.""" - return _top_level_stmts(program) - - def _get_auto_incore_body_stmts(self, program): - """Get statements inside the AutoInCore scope.""" - stmts = self._get_func_body_stmts(program) - # First stmt should be AutoInCore scope - scope = cast(ir.ScopeStmt, stmts[0]) - assert scope.scope_kind == ir.ScopeKind.AutoInCore - body = scope.body - # Body may be a single stmt or SeqStmts - if hasattr(body, "stmts"): - return _body_stmts(body) - return [body] - - def test_divisible_chunk_origin(self): - """Verify outer=ChunkOuter, inner=ChunkInner for divisible chunks.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 10, 1, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - inner_stmts = self._get_auto_incore_body_stmts(After) - outer_for = cast(ir.ForStmt, inner_stmts[0]) - - assert outer_for.attrs.get("loop_origin") == ir.LoopOrigin.ChunkOuter - - # Inner loop is inside outer body - 
outer_body_stmts = _body_stmts(outer_for.body) - inner_for = cast(ir.ForStmt, outer_body_stmts[0]) - assert inner_for.attrs.get("loop_origin") == ir.LoopOrigin.ChunkInner - - def test_non_divisible_chunk_origin(self): - """Verify outer=ChunkOuter, inner=ChunkInner, remainder=ChunkRemainder.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 7, 1, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - inner_stmts = self._get_auto_incore_body_stmts(After) - # stmts: [outer_for, remainder_for] - outer_for = cast(ir.ForStmt, inner_stmts[0]) - remainder_for = cast(ir.ForStmt, inner_stmts[1]) - - assert outer_for.attrs.get("loop_origin") == ir.LoopOrigin.ChunkOuter - - outer_body_stmts = _body_stmts(outer_for.body) - inner_for = cast(ir.ForStmt, outer_body_stmts[0]) - assert inner_for.attrs.get("loop_origin") == ir.LoopOrigin.ChunkInner - - assert remainder_for.attrs.get("loop_origin") == ir.LoopOrigin.ChunkRemainder - - def test_all_remainder_origin(self): - """Verify remainder=ChunkRemainder when trip_count < chunk_size.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 3, 1, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - inner_stmts = self._get_auto_incore_body_stmts(After) - remainder_for = cast(ir.ForStmt, inner_stmts[0]) - assert remainder_for.attrs.get("loop_origin") == ir.LoopOrigin.ChunkRemainder - - def test_non_chunked_loop_origin(self): - """Verify regular (non-chunked) loops have Original 
origin.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i in pl.range(0, 10, 1): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - stmts = self._get_func_body_stmts(After) - for_stmt = cast(ir.ForStmt, stmts[0]) - assert "loop_origin" not in for_stmt.attrs - - -class TestNestedChunking: - """Tests for nested chunked loops with iter_args propagation.""" - - def test_nested_outer_divisible_inner_remainder(self): - """Nested chunks: outer divisible, inner only remainder. - - Reproduces the bug where inner remainder loop's init_values - referenced the original (unsplit) iter_arg instead of the - inner iter_arg from the outer loop's split. - """ - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(8, chunk=4, chunk_policy="leading_full"): - for j in pl.parallel(1, chunk=2, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_0_out, (x_iter_1_outer,) in pl.parallel( - 0, 2, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_0_in, (x_iter_1_inner,) in pl.parallel( - 0, - 4, - 1, - init_values=(x_iter_1_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - for j_0_rem, (x_iter_3_rem,) in pl.parallel( - 0, - 1, - 1, - init_values=(x_iter_1_inner,), - attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder}, - ): - x_5: pl.Tensor[[64], pl.FP32] = pl.tensor.add(x_iter_3_rem, 1.0) - 
x_iter_3_rem_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_5) - x_iter_1_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter_3_rem_rv) - x_iter_1_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter_1_inner_rv) - return x_iter_1_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_nested_both_divisible(self): - """Nested chunks: both outer and inner divisible.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(8, chunk=4, chunk_policy="leading_full"): - for j in pl.parallel(12, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_0_out, (x_iter_1_outer,) in pl.parallel( - 0, 2, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_0_in, (x_iter_1_inner,) in pl.parallel( - 0, - 4, - 1, - init_values=(x_iter_1_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - for j_0_out, (x_iter_3_outer,) in pl.parallel( - 0, - 3, - 1, - init_values=(x_iter_1_inner,), - attrs={"loop_origin": pl.LoopOrigin.ChunkOuter}, - ): - for j_0_in, (x_iter_3_inner,) in pl.parallel( - 0, - 4, - 1, - init_values=(x_iter_3_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - x_5: pl.Tensor[[64], pl.FP32] = pl.tensor.add(x_iter_3_inner, 1.0) - x_iter_3_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_5) - x_iter_3_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter_3_inner_rv) - x_iter_1_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter_3_outer_rv) - x_iter_1_outer_rv: pl.Tensor[[64], 
pl.FP32] = pl.yield_(x_iter_1_inner_rv) - return x_iter_1_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_nested_both_remainder(self): - """Nested chunks: both outer and inner have remainders. - - Verifies init_values are correctly substituted in all paths: - outer-inner, outer-remainder, remainder-inner, remainder-remainder. - """ - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(6, chunk=4, chunk_policy="leading_full"): - for j in pl.parallel(3, chunk=2, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - printed = python_print(After) - init_refs = re.findall(r"init_values=\((\w+),\)", printed) - for ref in init_refs: - assert ref != "x__iter_v1", ( - "Found bare 'x__iter_v1' in init_values; should be a chunk-qualified iter name." - ) - assert ref != "x__iter_v3", ( - "Found bare 'x__iter_v3' in init_values; should be a chunk-qualified iter name." 
- ) - - -class TestDynamicChunking: - """Tests for chunked loops where start/stop are dynamic (runtime) scalars.""" - - @staticmethod - def _split_and_simplify(program): - """Run prerequisite passes, split chunked loops, and simplify expressions.""" - prepared = _prepare_for_split(program) - split = passes.split_chunked_loops()(prepared) - return passes.simplify()(split) - - def test_dynamic_stop(self): - """Dynamic stop: outer+inner+remainder with FloorDiv/FloorMod bounds.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32], n: pl.Scalar[pl.INDEX]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, n, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main( - self, x_0: pl.Tensor[[64], pl.FP32], n_0: pl.Scalar[pl.INDEX] - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.range( - 0, n_0 // 4, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_in, (x_inner,) in pl.range( - 0, 4, 1, init_values=(x_outer,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - for i_rem, (x_rem,) in pl.range( - 0, - n_0 % 4, - 1, - init_values=(x_outer_rv,), - attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder}, - ): - x_4: pl.Tensor[[64], pl.FP32] = pl.add(x_rem, 1.0) - x_rem_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_4) - return x_rem_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_dynamic_start_and_stop(self): - """Both start and stop are dynamic.""" - - @pl.program - class Input: - 
@pl.function - def main( - self, - x: pl.Tensor[[64], pl.FP32], - lo: pl.Scalar[pl.INDEX], - hi: pl.Scalar[pl.INDEX], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(lo, hi, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main( - self, - x_0: pl.Tensor[[64], pl.FP32], - lo_0: pl.Scalar[pl.INDEX], - hi_0: pl.Scalar[pl.INDEX], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.range( - 0, - pl.max(hi_0 - lo_0, 0) // 4, - 1, - init_values=(x_0,), - attrs={"loop_origin": pl.LoopOrigin.ChunkOuter}, - ): - for i_in, (x_inner,) in pl.range( - 0, 4, 1, init_values=(x_outer,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - for i_rem, (x_rem,) in pl.range( - 0, - pl.max(hi_0 - lo_0, 0) % 4, - 1, - init_values=(x_outer_rv,), - attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder}, - ): - x_4: pl.Tensor[[64], pl.FP32] = pl.add(x_rem, 1.0) - x_rem_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_4) - return x_rem_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_dynamic_stop_parallel(self): - """Dynamic stop with pl.parallel should also work.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32], n: pl.Scalar[pl.INDEX]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, n, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - 
@pl.function(strict_ssa=True) - def main( - self, x_0: pl.Tensor[[64], pl.FP32], n_0: pl.Scalar[pl.INDEX] - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.parallel( - 0, n_0 // 4, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_in, (x_inner,) in pl.parallel( - 0, 4, 1, init_values=(x_outer,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - for i_rem, (x_rem,) in pl.parallel( - 0, - n_0 % 4, - 1, - init_values=(x_outer_rv,), - attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder}, - ): - x_4: pl.Tensor[[64], pl.FP32] = pl.add(x_rem, 1.0) - x_rem_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_4) - return x_rem_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_static_still_works(self): - """Regression: static bounds should continue to produce same IR as before.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 10, 1, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_0_out, (x_iter_1_outer,) in pl.range( - 0, 2, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_0_in, (x_iter_1_inner,) in pl.range( - 0, - 5, - 1, - init_values=(x_iter_1_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, 
- ): - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_iter_1_inner, 1.0) - x_iter_1_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - x_iter_1_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter_1_inner_rv) - return x_iter_1_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - -class TestGuardedPolicy: - """Tests for the `guarded` chunk policy. - - Guarded mode emits a single outer loop over ceil(T/C) chunks and an inner - loop of size C, with the body wrapped in `if idx < stop` so out-of-range - iterations become no-ops. With iter_args, the guard becomes an IfStmt phi - whose else branch passes the inner iter_args through unchanged. - """ - - @staticmethod - def _split_and_simplify(program): - """Prepare, split, then simplify so conditions compare cleanly.""" - prepared = _prepare_for_split(program) - split = passes.split_chunked_loops()(prepared) - return passes.simplify()(split) - - def test_guarded_is_default(self): - """Omitting chunk_policy selects Guarded.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(7, chunk=5): - x = pl.add(x, 1.0) - return x - - @pl.program - class InputExplicit: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(7, chunk=5, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - # Default and explicit "guarded" must produce identical IR. 
- After = passes.split_chunked_loops()(_prepare_for_split(Input)) - AfterExplicit = passes.split_chunked_loops()(_prepare_for_split(InputExplicit)) - ir.assert_structural_equal(After, AfterExplicit) - - def test_guarded_divisible_iter_args(self): - """Static bound, trip_count divisible by chunk_size, with iter_args.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(10, chunk=5, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.range( - 2, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_in, (x_inner,) in pl.range( - 5, - init_values=(x_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - if i_out * 5 + i_in < 10: - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - else: - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_if) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - return x_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_non_divisible_iter_args(self): - """Static bound, trip_count NOT divisible by chunk_size: ceil(7/5)=2 outer chunks. - - The guard `idx < 7` disables lanes 7..9 in the second outer chunk, - and the else branch threads the inner iter_args through unchanged. 
- """ - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(7, chunk=5, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.range( - 2, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_in, (x_inner,) in pl.range( - 5, - init_values=(x_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - if i_out * 5 + i_in < 7: - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - else: - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_if) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - return x_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_trip_less_than_chunk(self): - """trip_count < chunk_size: ceil(3/5)=1 outer chunk, inner guard masks lanes >= 3.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(3, chunk=5, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.range( - 1, init_values=(x_0,), attrs={"loop_origin": 
pl.LoopOrigin.ChunkOuter} - ): - for i_in, (x_inner,) in pl.range( - 5, - init_values=(x_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - # Simplify proves i_out is always 0 (outer range [0,1)). - if i_in < 3: - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - else: - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_if) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - return x_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_no_iter_args(self): - """No iter_args: IfStmt has no phi and no else branch — body runs or is skipped.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(7, chunk=5, chunk_policy="guarded"): - _tmp = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out in pl.range(2, attrs={"loop_origin": pl.LoopOrigin.ChunkOuter}): - for i_in in pl.range(5, attrs={"loop_origin": pl.LoopOrigin.ChunkInner}): - if i_out * 5 + i_in < 7: - _tmp: pl.Tensor[[64], pl.FP32] = pl.add(x_0, 1.0) - return x_0 - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_with_step(self): - """Non-unit step: guard compares `idx * step < stop`, idx = (out*C + in).""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(0, 20, 2, chunk=5, chunk_policy="guarded"): - x = 
pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.range( - 2, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_in, (x_inner,) in pl.range( - 5, - init_values=(x_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - if (i_out * 5 + i_in) * 2 < 20: - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - else: - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_if) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - return x_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_parallel(self): - """pl.parallel: both outer and inner guarded loops are Parallel kind.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.parallel(8, chunk=4, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.parallel( - 2, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_in, (x_inner,) in pl.parallel( - 4, - init_values=(x_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - if i_out * 4 + i_in < 8: - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - 
else: - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_if) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - return x_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_dynamic_stop(self): - """Dynamic stop `n`: outer count = ceil(n/4) = (n + 3) // 4.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32], n: pl.Scalar[pl.INDEX]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(n, chunk=4, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main( - self, x_0: pl.Tensor[[64], pl.FP32], n_0: pl.Scalar[pl.INDEX] - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.range( - (n_0 + 3) // 4, - init_values=(x_0,), - attrs={"loop_origin": pl.LoopOrigin.ChunkOuter}, - ): - for i_in, (x_inner,) in pl.range( - 4, - init_values=(x_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - if i_out * 4 + i_in < n_0: - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - else: - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_if) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - return x_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_dynamic_start_and_stop(self): - """Dynamic start AND stop: outer count = ceil(max(hi-lo, 0) / 4).""" - - @pl.program - class Input: - @pl.function - def main( - self, - x: pl.Tensor[[64], pl.FP32], - lo: pl.Scalar[pl.INDEX], - hi: pl.Scalar[pl.INDEX], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, 
optimization=pl.chunked_loop_optimizer): - for _i in pl.range(lo, hi, 1, chunk=4, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main( - self, - x_0: pl.Tensor[[64], pl.FP32], - lo_0: pl.Scalar[pl.INDEX], - hi_0: pl.Scalar[pl.INDEX], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.range( - (pl.max(hi_0 - lo_0, 0) + 3) // 4, - init_values=(x_0,), - attrs={"loop_origin": pl.LoopOrigin.ChunkOuter}, - ): - for i_in, (x_inner,) in pl.range( - 4, - init_values=(x_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - if lo_0 + (i_out * 4 + i_in) < hi_0: - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - else: - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_if) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - return x_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_dynamic_no_iter_args(self): - """Dynamic bound with no iter_args: IfStmt has no phi.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32], n: pl.Scalar[pl.INDEX]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(n, chunk=4, chunk_policy="guarded"): - _tmp = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main( - self, x_0: pl.Tensor[[64], pl.FP32], n_0: pl.Scalar[pl.INDEX] - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out in pl.range((n_0 + 3) // 4, attrs={"loop_origin": pl.LoopOrigin.ChunkOuter}): - for i_in in 
pl.range(4, attrs={"loop_origin": pl.LoopOrigin.ChunkInner}): - if i_out * 4 + i_in < n_0: - _tmp: pl.Tensor[[64], pl.FP32] = pl.add(x_0, 1.0) - return x_0 - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_nested(self): - """Nested guarded loops: inner guarded loop lives inside outer's then-branch. - - Verifies iter_args thread correctly through both levels of IfStmt phi. - """ - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.parallel(8, chunk=4, chunk_policy="guarded"): - for _j in pl.parallel(3, chunk=2, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.parallel( - 2, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_in, (x_inner,) in pl.parallel( - 4, - init_values=(x_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - if i_out * 4 + i_in < 8: - for j_out, (x_j_outer,) in pl.parallel( - 2, - init_values=(x_inner,), - attrs={"loop_origin": pl.LoopOrigin.ChunkOuter}, - ): - for j_in, (x_j_inner,) in pl.parallel( - 2, - init_values=(x_j_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - if j_out * 2 + j_in < 3: - x_5: pl.Tensor[[64], pl.FP32] = pl.add(x_j_inner, 1.0) - x_j_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_5) - else: - x_j_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_j_inner) - x_j_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_j_if) - x_j_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_j_inner_rv) - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_j_outer_rv) - else: - x_if: pl.Tensor[[64], 
pl.FP32] = pl.yield_(x_inner) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_if) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - return x_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_negative_step(self): - """Descending chunked range: guard uses `idx > stop` since step < 0. - - Regression test: the initial implementation built the guard as `idx < stop` - unconditionally, which made every iteration of a descending loop a no-op. - """ - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(10, 0, -1, chunk=4, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.range( - 3, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_in, (x_inner,) in pl.range( - 4, - init_values=(x_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - # Original guard: 10 + (i_out*4 + i_in) * -1 > 0 - # Simplify rearranges stop to the left-hand side. 
- if -10 < (i_out * 4 + i_in) * -1: - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - else: - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_if) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - return x_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_negative_step_no_iter_args(self): - """Descending chunked range without iter_args: guard still uses `idx > stop`.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(10, 0, -1, chunk=4, chunk_policy="guarded"): - _tmp = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out in pl.range(3, attrs={"loop_origin": pl.LoopOrigin.ChunkOuter}): - for i_in in pl.range(4, attrs={"loop_origin": pl.LoopOrigin.ChunkInner}): - if -10 < (i_out * 4 + i_in) * -1: - _tmp: pl.Tensor[[64], pl.FP32] = pl.add(x_0, 1.0) - return x_0 - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_origin_attrs(self): - """Guarded mode sets ChunkOuter/ChunkInner attrs and never emits ChunkRemainder.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(7, chunk=5, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - # Navigate: [ScopeStmt, return]; 
ScopeStmt.body = outer_for (guarded is a single for) - stmts = _top_level_stmts(After) - scope = cast(ir.ScopeStmt, stmts[0]) - outer_for = cast(ir.ForStmt, scope.body) - assert outer_for.attrs.get("loop_origin") == ir.LoopOrigin.ChunkOuter - - inner_for = cast(ir.ForStmt, _body_stmts(outer_for.body)[0]) - assert inner_for.attrs.get("loop_origin") == ir.LoopOrigin.ChunkInner - - # No remainder loop should exist. - printed = python_print(After) - assert "ChunkRemainder" not in printed - - def test_guarded_printer_omits_default(self): - """Printer omits `chunk_policy="guarded"` (it's the default) but prints `leading_full`.""" - - @pl.program - class Guarded: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(10, chunk=5, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - @pl.program - class LeadingFull: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(10, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - guarded_printed = python_print(Guarded) - leading_printed = python_print(LeadingFull) - assert "chunk_policy" not in guarded_printed - assert 'chunk_policy="leading_full"' in leading_printed - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/tests/ut/ir/transforms/test_transform_utils.py b/tests/ut/ir/transforms/test_transform_utils.py index 0d5346de4..8fbbea739 100644 --- a/tests/ut/ir/transforms/test_transform_utils.py +++ b/tests/ut/ir/transforms/test_transform_utils.py @@ -152,7 +152,7 @@ def test_scope_stmt(self): """Collects vars from ScopeStmt body.""" v1, s1 = _assign("s", _const(7)) body = ir.SeqStmts([s1], _span()) - scope = ir.InCoreScopeStmt(body=body, span=_span()) + scope = 
ir.HierarchyScopeStmt(level=ir.Level.CORE_GROUP, body=body, span=_span()) result = ir.collect_def_vars(scope) assert len(result) == 1 assert result[0] is v1 diff --git a/tests/ut/language/parser/test_error_cases.py b/tests/ut/language/parser/test_error_cases.py index 4456d93c9..f09e76197 100644 --- a/tests/ut/language/parser/test_error_cases.py +++ b/tests/ut/language/parser/test_error_cases.py @@ -122,29 +122,6 @@ def bad_target(n: pl.Tensor[[1], pl.INT32]) -> pl.Tensor[[1], pl.INT32]: return result - def test_chunked_loop_requires_auto_incore(self): - """Test that chunked loops are rejected outside auto_incore scope.""" - code = """ -import pypto.language as pl - -@pl.program -class ChunkedLoopProgram: - @pl.function(type=pl.FunctionType.Orchestration) - def main( - self, - x: pl.Tensor[[16, 4], pl.FP32], - seq_lens: pl.Tensor[[16], pl.INT32], - ) -> pl.Tensor[[16, 4], pl.FP32]: - for b in pl.parallel(0, 16, 1, chunk=4, chunk_policy="leading_full"): - _ctx_len = pl.tensor.read(seq_lens, [b]) - return x -""" - with pytest.raises( - ParserSyntaxError, - match=r"chunk=\.\.\. 
loops are only valid inside with pl\.at", - ): - pl.parse_program(code) - def test_unknown_tensor_operation(self): """Test error on unknown tensor operation.""" diff --git a/tests/ut/language/parser/test_scope_parsing.py b/tests/ut/language/parser/test_scope_parsing.py index 4ff230b80..80e6ce291 100644 --- a/tests/ut/language/parser/test_scope_parsing.py +++ b/tests/ut/language/parser/test_scope_parsing.py @@ -133,7 +133,7 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: scope_stmt = body assert isinstance(scope_stmt, ir.ScopeStmt) assert scope_stmt.name_hint == "my_kernel" - assert scope_stmt.scope_kind == ir.ScopeKind.InCore + assert scope_stmt.scope_kind == ir.ScopeKind.Hierarchy def test_parse_unnamed_scope_has_empty_name(self): """Test that unnamed scopes have empty name.""" diff --git a/tests/ut/language/test_range_unroll_kwarg.py b/tests/ut/language/test_range_unroll_kwarg.py index 79adedc61..dcf1cbf13 100644 --- a/tests/ut/language/test_range_unroll_kwarg.py +++ b/tests/ut/language/test_range_unroll_kwarg.py @@ -113,7 +113,7 @@ def test_unroll_with_chunk_rejected(self): class _P: @pl.function def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk]): + with pl.at(level=pl.Level.CORE_GROUP): for i in pl.range(8, chunk=4, unroll=2): x = pl.add(x, 1.0) return x