diff --git a/.claude/rules/pass-doc-ordering.md b/.claude/rules/pass-doc-ordering.md index 48ac9c92d..89d39049f 100644 --- a/.claude/rules/pass-doc-ordering.md +++ b/.claude/rules/pass-doc-ordering.md @@ -17,19 +17,18 @@ Developers read pass docs sequentially to understand the compilation pipeline. I | 02 | `02-ctrl_flow_transform.md` | 2nd pass | | 03 | `03-convert_to_ssa.md` | 3rd pass | | 04 | `04-flatten_call_expr.md` | 4th pass | -| 05 | `05-split_chunked_loops.md` | 5th pass | -| 06 | `06-interchange_chunk_loops.md` | 6th pass | -| 07 | `07-outline_incore_scopes.md` | 7th pass | -| 08 | `08-outline_cluster_scopes.md` | 8th pass | -| 09 | `09-convert_tensor_to_tile_ops.md` | 9th pass | -| 10 | `10-optimize_orch_tensors.md` | 10th pass | -| 11 | `11-flatten_tile_nd_to_2d.md` | 11th pass | -| 12 | *(no doc yet)* | 12th pass (`InferTileMemorySpace`) | -| 13 | *(no doc yet)* | 13th pass (`ResolveTransposeLayout`) | -| 14 | `14-expand_mixed_kernel.md` | 14th pass | -| 15 | `15-init_memref.md` | 15th pass | -| 16 | `16-memory_reuse.md` | 16th pass | -| 17 | `17-allocate_memory_addr.md` | 17th pass | +| 05 | `05-outline_hierarchy_scopes.md` | 5th pass (non-CORE_GROUP → `Opaque`) | +| 06 | `06-outline_incore_scopes.md` | 6th pass (CORE_GROUP → `InCore`, promote parent) | +| 07 | `07-outline_cluster_scopes.md` | 7th pass | +| 08 | `08-convert_tensor_to_tile_ops.md` | 8th pass | +| 09 | `09-optimize_orch_tensors.md` | 9th pass | +| 10 | `10-flatten_tile_nd_to_2d.md` | 10th pass | +| 11 | `11-expand_mixed_kernel.md` | 11th pass | +| 12 | `12-init_memref.md` | 12th pass | +| 13 | `13-memory_reuse.md` | 13th pass | +| 14 | `14-allocate_memory_addr.md` | 14th pass | +| 15 | `15-partial_unroll_tile_loops.md` | 15th pass | +| 16 | `16-reorder_unrolled_io.md` | 16th pass | | 90 | `90-insert_sync.md` | Not in Default strategy | | 91 | `91-utility_passes.md` | Not in Default strategy | | 99 | `99-verifier.md` | Infrastructure (not a pipeline pass) | diff --git a/CMakeLists.txt 
b/CMakeLists.txt index b92a4a01f..27582902b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -147,9 +147,9 @@ set(PYPTO_SOURCES src/ir/transforms/mutator.cpp src/ir/transforms/normalize_stmt_structure_pass.cpp src/ir/transforms/op_conversion_registry.cpp - src/ir/transforms/outline_incore_scopes_pass.cpp src/ir/transforms/outline_cluster_scopes_pass.cpp src/ir/transforms/outline_hierarchy_scopes_pass.cpp + src/ir/transforms/outline_incore_scopes_pass.cpp src/ir/transforms/expand_mixed_kernel_pass.cpp src/ir/transforms/split_vector_kernel_pass.cpp src/ir/transforms/flatten_tile_nd_to_2d_pass.cpp @@ -159,8 +159,6 @@ set(PYPTO_SOURCES src/ir/transforms/resolve_transpose_layout_pass.cpp src/ir/transforms/python_printer.cpp src/ir/transforms/simplify_pass.cpp - src/ir/transforms/split_chunked_loops_pass.cpp - src/ir/transforms/interchange_chunk_loops_pass.cpp src/ir/transforms/unroll_loops_pass.cpp src/ir/transforms/partial_unroll_tile_loops_pass.cpp src/ir/transforms/reorder_unrolled_io_pass.cpp diff --git a/docs/en/dev/ir/01-hierarchy.md b/docs/en/dev/ir/01-hierarchy.md index 18662121f..a09c7525d 100644 --- a/docs/en/dev/ir/01-hierarchy.md +++ b/docs/en/dev/ir/01-hierarchy.md @@ -32,7 +32,12 @@ This document provides a complete reference of all IR node types, organized by c ::= "return" [ ] ::= ::= { ";" } - ::= "with" "pl.incore" "(" ")" ":" + ::= "with" "pl.at" "(" "level" "=" [ "," "role" "=" ] + [ "," "optimizations" "=" "[" "]" ] ")" + ":" + | "with" "pl.cluster" "(" ")" ":" + | "with" "pl.spmd" "(" "core_num" "=" + [ "," "sync_start" "=" ] ")" ":" ::= "break" ::= "continue" @@ -153,10 +158,8 @@ field from the `Stmt` base class. 
See [Leading comments on statements](#leading- | **IfStmt** | `condition_`, `then_stmts_`, `else_stmts_`, `return_vars_` | Conditional branching | | **ForStmt** | `loop_var_` (DefField), `start_`, `stop_`, `step_`, `iter_args_` (DefField), `body_`, `return_vars_` (DefField), `kind_` | For loop with optional iteration args | | **WhileStmt** | `condition_`, `iter_args_` (DefField), `body_`, `return_vars_` (DefField) | While loop with condition and iteration args | -| **InCoreScopeStmt** | `name_hint_`, `body_`, `split_` (optional) | InCore region; outlined to `Function(InCore)` | -| **AutoInCoreScopeStmt** | `name_hint_`, `body_`, `split_` (optional) | Auto-InCore region; consumed by `InterchangeChunkLoops` | | **ClusterScopeStmt** | `name_hint_`, `body_` | Cluster region; outlined to `Function(Group)` | -| **HierarchyScopeStmt** | `name_hint_`, `body_`, `level_`, `role_` (optional) | Pipeline-stage region for a given Level/Role | +| **HierarchyScopeStmt** | `name_hint_`, `body_`, `level_`, `role_` (optional), `split_` (optional) | Pipeline-stage region for a given Level/Role; outlined to `Function(InCore)` when `level_ == CORE_GROUP` and to `Function(Opaque)` otherwise | | **SpmdScopeStmt** | `name_hint_`, `body_`, `core_num_`, `sync_start_` | SPMD launch region; outlined to `Function(Spmd)` | | **YieldStmt** | `values_` | Yield values in loop iteration | | **EvalStmt** | `expr_` | Evaluate expression for side effects | @@ -252,25 +255,22 @@ while_stmt = ir.WhileStmt(condition, [x_iter], body, [x_final], span) ### ScopeStmt Details `ScopeStmt` is an **abstract base class** that marks a region with a specific -execution context. The five concrete subclasses below each carry only the +execution context. The three concrete subclasses below each carry only the fields valid for their kind — invalid combinations are unrepresentable at construction. 
Use `s.scope_kind` (or `s.GetScopeKind()` in C++) to recover the -kind from a `ScopeStmt`-typed reference, or `isinstance(s, InCoreScopeStmt)` +kind from a `ScopeStmt`-typed reference, or `isinstance(s, HierarchyScopeStmt)` to dispatch on the concrete type. -All five share the common base fields `name_hint_: str` and `body_: StmtPtr`. -Note that `pl.at(level=Level.CORE_GROUP)` lowers to `InCoreScopeStmt` / -`AutoInCoreScopeStmt`, not `HierarchyScopeStmt` — the parser rejects `role=` -at `CORE_GROUP`. `HierarchyScopeStmt` is reserved for non-`CORE_GROUP` levels -(host, cluster, global) and is not a general replacement for in-core scopes. +All three share the common base fields `name_hint_: str` and `body_: StmtPtr`. +`pl.at(level=...)` always lowers to `HierarchyScopeStmt` — including the +`level=Level.CORE_GROUP` form, which produces a `HierarchyScopeStmt` with +`level_ == CORE_GROUP` and an optional `split_`. `OutlineIncoreScopes` +later turns that `CORE_GROUP` scope into a `Function(InCore)` and re-types +the parent `Opaque` function as `Orchestration`. Non-`CORE_GROUP` +`HierarchyScopeStmt`s are outlined into `Function(Opaque)` by +`OutlineHierarchyScopes` (which runs immediately before `OutlineIncoreScopes`). 
```python -# with pl.incore(): y = pl.add(x, x) -in_core = ir.InCoreScopeStmt(name_hint="", body=body, span=span) - -# with pl.auto_incore(): (split is optional) -auto = ir.AutoInCoreScopeStmt(name_hint="", body=body, span=span) - # with pl.cluster(): cluster = ir.ClusterScopeStmt(name_hint="", body=body, span=span) @@ -278,6 +278,12 @@ cluster = ir.ClusterScopeStmt(name_hint="", body=body, span=span) hier = ir.HierarchyScopeStmt(level=ir.Level.HOST, role=ir.Role.Worker, name_hint="", body=body, span=span) +# with pl.at(level=Level.CORE_GROUP, +# optimizations=[pl.split(pl.SplitMode.UP_DOWN)]): +hier_core = ir.HierarchyScopeStmt(level=ir.Level.CORE_GROUP, + split=ir.SplitMode.UP_DOWN, + name_hint="", body=body, span=span) + # with pl.spmd(core_num=8): spmd = ir.SpmdScopeStmt(core_num=8, sync_start=False, name_hint="", body=body, span=span) @@ -289,20 +295,33 @@ spmd = ir.SpmdScopeStmt(core_num=8, sync_start=False, are not control flow (execute once, linearly). - Required fields are enforced at construction: `HierarchyScopeStmt.level_` is non-optional; `SpmdScopeStmt` rejects `core_num <= 0`. -- `InCoreScopeStmt` / `AutoInCoreScopeStmt` are scheduled for deprecation; - prefer `HierarchyScopeStmt` or other surviving kinds in new code. +- `HierarchyScopeStmt.split_` is optional and is only meaningful at + `Level.CORE_GROUP`. It is copied onto the outlined `InCore` function's + attrs so `ExpandMixedKernel` can read the hint. - Pass behavior: - - `InterchangeChunkLoops` consumes `AutoInCoreScopeStmt` - - `OutlineIncoreScopes` extracts `InCoreScopeStmt` into `Function(InCore)` + - `OutlineHierarchyScopes` extracts every non-`CORE_GROUP` + `HierarchyScopeStmt` into a dedicated `FunctionType::Opaque` function. + Parent function types are preserved. + - `OutlineIncoreScopes` (runs immediately after) extracts every + `CORE_GROUP` `HierarchyScopeStmt` into a dedicated `FunctionType::InCore` + function. 
Parents that contained at least one `CORE_GROUP` scope are + re-typed from `Opaque` to `Orchestration`. - `OutlineClusterScopes` extracts `ClusterScopeStmt` into `Function(Group)` - and standalone `SpmdScopeStmt` into `Function(Spmd)` - - `OutlineHierarchyScopes` extracts `HierarchyScopeStmt` + and standalone `SpmdScopeStmt` into `Function(Spmd)`. **Transformation:** ```python -# Before: with pl.incore(): y = pl.add(x, x); return y -# After: main_incore_0(x) -> y; main(x): y = main_incore_0(x); return y +# Before: +# def main(x): +# with pl.at(level=pl.Level.CORE_GROUP): +# y = pl.add(x, x) +# return y +# After: +# def main_core_group_0(x) -> y: ... # FunctionType.InCore +# def main(x) -> y: # FunctionType.Orchestration +# y = main_core_group_0(x) +# return y ``` **Parallel for loop (ForKind):** @@ -444,7 +463,7 @@ Functions stored in sorted map for deterministic ordering. GlobalVar names must | **Unary Ops** | 5 | Abs, Neg, Not, BitNot, Cast | | **Call/Access** | 2 | Call, TupleGetItemExpr | | **Operations** | 2 | Op, GlobalVar | -| **Statements** | 15 | AssignStmt, IfStmt, ForStmt, WhileStmt, ReturnStmt, InCoreScopeStmt, AutoInCoreScopeStmt, ClusterScopeStmt, HierarchyScopeStmt, SpmdScopeStmt, YieldStmt, EvalStmt, SeqStmts, BreakStmt, ContinueStmt | +| **Statements** | 13 | AssignStmt, IfStmt, ForStmt, WhileStmt, ReturnStmt, ClusterScopeStmt, HierarchyScopeStmt, SpmdScopeStmt, YieldStmt, EvalStmt, SeqStmts, BreakStmt, ContinueStmt | | **Types** | 6 | ScalarType, TensorType, TileType, TupleType, PipeType, UnknownType | | **Functions** | 2 | Function, Program | diff --git a/docs/en/dev/language/00-python_syntax.md b/docs/en/dev/language/00-python_syntax.md index 6a4033aed..5a882bca2 100644 --- a/docs/en/dev/language/00-python_syntax.md +++ b/docs/en/dev/language/00-python_syntax.md @@ -256,22 +256,17 @@ for i in pl.unroll(12, chunk=4): body_statements ``` -**Key points:** `chunk=C` splits the loop into an outer sequential loop and an inner loop of `C` iterations. 
The inner loop preserves the original kind (Sequential/Parallel/Unroll). `chunk` cannot be combined with `init_values`, and `chunk=` loops are only valid inside a `with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk]):` — outside that scope the parser rejects them with an error. See [SplitChunkedLoops Pass](../passes/05-split_chunked_loops.md). +**Key points:** `chunk=C` splits the loop into an outer sequential loop and an inner loop of `C` iterations. The inner loop preserves the original kind (Sequential/Parallel/Unroll). `chunk` cannot be combined with `init_values`. ### Scope Context Managers -| Form | Scope Kind | Notes | -| ---- | ---------- | ----- | -| `pl.at(level=pl.Level.CORE_GROUP)` | `InCore` | Fixed-boundary outline at CORE_GROUP | -| `pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.split(MODE)])` | `InCore` | InCore + cross-core split hint | -| `pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk])` | `AutoInCore` | Compiler-driven chunked loop split | -| `pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk, pl.split(MODE)])` | `AutoInCore` | AutoInCore + split hint (independent entries) | -| `pl.at(level=pl.Level.HOST)` *(or any non-`CORE_GROUP` level)* | `Hierarchy` | Distributed hierarchy scope | -| `pl.cluster()` | `Cluster` | Co-scheduled AIC+AIV group | -| `pl.incore()` *(deprecated)* | `InCore` | Use `pl.at(level=pl.Level.CORE_GROUP)` instead | -| `pl.auto_incore(split=...)` *(deprecated)* | `AutoInCore` | Use `pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk, pl.split(...)])` | -| `pl.at(..., optimization=pl.chunked_loop_optimizer[(split=...)])` *(deprecated)* | `AutoInCore` | Use `pl.at(..., optimizations=[pl.auto_chunk, pl.split(...)])` | -| `pl.at(..., split=...)` *(deprecated)* | `InCore` | Use `pl.at(..., optimizations=[pl.split(...)])` | +| Form | Produces | Notes | +| ---- | -------- | ----- | +| `pl.at(level=pl.Level.CORE_GROUP)` | `HierarchyScopeStmt` (level=CORE_GROUP) | Outlined to 
`Function(InCore)` by `OutlineIncoreScopes`; parent `Opaque` is promoted to `Orchestration` | +| `pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.split(MODE)])` | `HierarchyScopeStmt` (level=CORE_GROUP, split=MODE) | Same as above; the split hint is carried on the outlined function and consumed by `ExpandMixedKernel` | +| `pl.at(level=pl.Level.HOST)` *(or any non-`CORE_GROUP` level)* | `HierarchyScopeStmt` (level=HOST/...) | Outlined to `Function(Opaque)` by `OutlineHierarchyScopes`; parent type preserved | +| `pl.cluster()` | `ClusterScopeStmt` | Outlined to `Function(Group)` by `OutlineClusterScopes` | +| `pl.spmd(core_num=N[, sync_start=...])` | `SpmdScopeStmt` | Standalone (non-cluster) spmd is outlined to `Function(Spmd)`; inside a cluster the attrs are hoisted onto the Group function | See [Language Guide](../../user/01-language_guide.md#incore-scopes) for examples. diff --git a/docs/en/dev/passes/00-pass_manager.md b/docs/en/dev/passes/00-pass_manager.md index fec4736b3..73851d4db 100644 --- a/docs/en/dev/passes/00-pass_manager.md +++ b/docs/en/dev/passes/00-pass_manager.md @@ -33,7 +33,7 @@ Framework for organizing and executing IR transformation passes on Programs with | `NoNestedCalls` | No nested call expressions | | `NormalizedStmtStructure` | Statement structure normalized | | `NoRedundantBlocks` | No single-child or nested SeqStmts | -| `SplitIncoreOrch` | InCore scopes outlined into separate functions | +| `HierarchyOutlined` | `HierarchyScopeStmt` regions outlined into functions (`Opaque` for non-CORE_GROUP via `OutlineHierarchyScopes`; `InCore` for `CORE_GROUP` via `OutlineIncoreScopes`); parent re-typed as `Orchestration` when a `CORE_GROUP` scope was outlined. Produced by `OutlineIncoreScopes` (the second of the two outline passes). 
| | `ClusterOutlined` | Cluster scopes outlined into Group functions | | `HasMemRefs` | MemRef objects initialized on variables | | `IncoreTileOps` | InCore functions use tile ops | @@ -61,21 +61,20 @@ struct PassProperties { | UnrollLoops | TypeChecked | TypeChecked | — | | CtrlFlowTransform | TypeChecked | TypeChecked, StructuredCtrlFlow | — | | ConvertToSSA | TypeChecked | TypeChecked, SSAForm | NormalizedStmtStructure | -| FlattenCallExpr | SSAForm | SSAForm, NoNestedCalls | NormalizedStmtStructure | -| SplitChunkedLoops | TypeChecked, SSAForm | TypeChecked, SSAForm | — | -| InterchangeChunkLoops | TypeChecked, SSAForm | TypeChecked, SSAForm | — | | NormalizeStmtStructure | TypeChecked | TypeChecked, NormalizedStmtStructure | — | -| OutlineIncoreScopes | TypeChecked, SSAForm | SplitIncoreOrch | — | +| FlattenCallExpr | SSAForm | SSAForm, NoNestedCalls | NormalizedStmtStructure | +| OutlineHierarchyScopes | SSAForm | SSAForm | — | +| OutlineIncoreScopes | SSAForm | SSAForm, HierarchyOutlined | — | | OutlineClusterScopes | TypeChecked, SSAForm | ClusterOutlined | — | -| ConvertTensorToTileOps | SplitIncoreOrch | IncoreTileOps | — | +| ConvertTensorToTileOps | HierarchyOutlined | IncoreTileOps | — | | FlattenTileNdTo2D | SSAForm, IncoreTileOps | SSAForm, TileOps2D | — | -| ResolveBackendOpLayouts | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D | NormalizedStmtStructure | -| ExpandMixedKernel | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D | SSAForm, MixedKernelExpanded | — | -| NormalizeReturnOrder | SplitIncoreOrch, IncoreTileOps | — | — | -| InitMemRef | TypeChecked, SSAForm, SplitIncoreOrch, IncoreTileOps, TileOps2D | HasMemRefs | SSAForm | -| MemoryReuse | TypeChecked, SplitIncoreOrch, IncoreTileOps, HasMemRefs, TileOps2D | — | — | -| InsertSync | TypeChecked, SplitIncoreOrch, IncoreTileOps, HasMemRefs, TileOps2D | — | — | -| AllocateMemoryAddr | TypeChecked, SplitIncoreOrch, IncoreTileOps, 
HasMemRefs, TileOps2D | AllocatedMemoryAddr | — | +| ResolveBackendOpLayouts | SSAForm, IncoreTileOps, HierarchyOutlined, TileOps2D | SSAForm, IncoreTileOps, HierarchyOutlined, TileOps2D | NormalizedStmtStructure | +| ExpandMixedKernel | SSAForm, IncoreTileOps, HierarchyOutlined, TileOps2D | SSAForm, MixedKernelExpanded | — | +| NormalizeReturnOrder | HierarchyOutlined, IncoreTileOps | — | — | +| InitMemRef | TypeChecked, SSAForm, HierarchyOutlined, IncoreTileOps, TileOps2D | HasMemRefs | SSAForm | +| MemoryReuse | TypeChecked, HierarchyOutlined, IncoreTileOps, HasMemRefs, TileOps2D | — | — | +| InsertSync | TypeChecked, HierarchyOutlined, IncoreTileOps, HasMemRefs, TileOps2D | — | — | +| AllocateMemoryAddr | TypeChecked, HierarchyOutlined, IncoreTileOps, HasMemRefs, TileOps2D | AllocatedMemoryAddr | — | | FuseCreateAssembleToSlice | — | — | — | | Simplify | — | — | — | diff --git a/docs/en/dev/passes/01-unroll_loops.md b/docs/en/dev/passes/01-unroll_loops.md index cfe582659..8f994c3e5 100644 --- a/docs/en/dev/passes/01-unroll_loops.md +++ b/docs/en/dev/passes/01-unroll_loops.md @@ -77,10 +77,10 @@ class After: UnrollLoops runs **once** in `Default` and `DebugTileOptimization`, before control flow structuring: ```text -UnrollLoops → CtrlFlowTransform → ConvertToSSA → FlattenCallExpr → SplitChunkedLoops → InterchangeChunkLoops → OutlineIncoreScopes → ... +UnrollLoops → CtrlFlowTransform → ConvertToSSA → NormalizeStmtStructure → FlattenCallExpr → OutlineHierarchyScopes → OutlineIncoreScopes → OutlineClusterScopes → ... ``` -UnrollLoops expands non-chunked `pl.unroll()` loops (skipping chunked unroll loops which retain `chunk` for `SplitChunkedLoops` to handle later). +UnrollLoops expands `pl.unroll()` loops into their inline body copies. 
## Pass Properties diff --git a/docs/en/dev/passes/02-ctrl_flow_transform.md b/docs/en/dev/passes/02-ctrl_flow_transform.md index 4c097a185..4ebf2571f 100644 --- a/docs/en/dev/passes/02-ctrl_flow_transform.md +++ b/docs/en/dev/passes/02-ctrl_flow_transform.md @@ -163,7 +163,7 @@ while i < n and not __break_0: CtrlFlowTransform runs after UnrollLoops and before ConvertToSSA: ```text -UnrollLoops -> CtrlFlowTransform -> ConvertToSSA -> FlattenCallExpr -> SplitChunkedLoops -> ... +UnrollLoops -> CtrlFlowTransform -> ConvertToSSA -> NormalizeStmtStructure -> FlattenCallExpr -> OutlineHierarchyScopes -> ... ``` ## Pass Properties diff --git a/docs/en/dev/passes/03-convert_to_ssa.md b/docs/en/dev/passes/03-convert_to_ssa.md index da0db1c03..2a52dd733 100644 --- a/docs/en/dev/passes/03-convert_to_ssa.md +++ b/docs/en/dev/passes/03-convert_to_ssa.md @@ -13,7 +13,7 @@ This pass transforms IR with multiple assignments to the same variable into SSA **Requires**: `TypeChecked` property. `TypeChecked` is verified automatically at BASIC level once produced; use a `VerificationInstrument` via `PassContext` to validate required properties before this pass runs. -**When to use**: Run this pass before any optimization or analysis that requires SSA form (e.g., OutlineIncoreScopes, memory optimization passes). +**When to use**: Run this pass before any optimization or analysis that requires SSA form (e.g., OutlineHierarchyScopes, memory optimization passes). ## API diff --git a/docs/en/dev/passes/05-outline_hierarchy_scopes.md b/docs/en/dev/passes/05-outline_hierarchy_scopes.md new file mode 100644 index 000000000..48b97ca31 --- /dev/null +++ b/docs/en/dev/passes/05-outline_hierarchy_scopes.md @@ -0,0 +1,197 @@ +# OutlineHierarchyScopes Pass + +Outlines non-`CORE_GROUP` `HierarchyScopeStmt` regions into separate +`Opaque` functions, carrying the scope's level/role metadata onto the +outlined function. 
+ +## Overview + +This pass transforms each `HierarchyScopeStmt` whose `level_` is not +`Level.CORE_GROUP` into a dedicated `Function` definition and replaces the +scope with a `Call` to that function. The outlined function is always typed +`FunctionType::Opaque`; the parent function's type is preserved. + +| Scope `level_` | Handled by this pass | Outlined function type | Parent function type after pass | +| -------------- | -------------------- | ---------------------- | ------------------------------- | +| `Level.HOST`, `Level.CLUSTER`, `Level.GLOBAL`, ... | Yes | `FunctionType::Opaque` | unchanged (preserved) | +| `Level.CORE_GROUP` | **No — intentionally left alone** | *(handled by [`OutlineIncoreScopes`](06-outline_incore_scopes.md))* | *(promoted to `Orchestration` by the next pass)* | + +`CORE_GROUP` scopes are intentionally left untouched here; the immediately +following pass, [`OutlineIncoreScopes`](06-outline_incore_scopes.md), +outlines them into `Function(InCore)` and promotes the parent function from +`Opaque` to `Orchestration`. + +**Requirements**: + +- Input IR must be in SSA form (run `ConvertToSSA` first). SSA form is + preserved (produced) by this pass. +- Processes `Opaque` functions. Functions already typed as + `Orchestration`, `InCore`, `AIC`, `AIV`, or `Group` are left untouched. + +**When to use**: Run after `ConvertToSSA`/`FlattenCallExpr` when the IR +contains `with pl.at(level=...):` scopes for non-`CORE_GROUP` levels that +need to be extracted into callable helper functions. + +## API + +| C++ | Python | Level | +| --- | ------ | ----- | +| `pass::OutlineHierarchyScopes()` | `passes.outline_hierarchy_scopes()` | Program-level | + +**Factory function**: + +```cpp +Pass OutlineHierarchyScopes(); +``` + +**Python usage**: + +```python +from pypto.pypto_core import passes + +outline_pass = passes.outline_hierarchy_scopes() +program_outlined = outline_pass(program) +``` + +## Algorithm + +1. 
**Scan for Hierarchy Scopes**: Find every `HierarchyScopeStmt` inside each + `Opaque` function body whose `level_` is **not** `CORE_GROUP`. +2. **Analyze Inputs/Outputs**: Use the shared scope-outline helpers to compute + the set of variables defined outside but used inside (inputs) and defined + inside but used outside (outputs). +3. **Create Outlined Function**: Extract the scope body into a new `Function`: + - Parameters = input variables + - Returns = output variables + - Body = the scope body + - `func_type_` = `Opaque` + - Copy `role_` metadata into function attrs. +4. **Replace the Scope**: Substitute the original `HierarchyScopeStmt` with + a `Call` to the outlined function followed by `AssignStmt`s that bind its + return values. +5. **Preserve Parent Type**: The parent function's `func_type_` is not + changed by this pass. Parent type promotion for `CORE_GROUP` scopes is + the responsibility of [`OutlineIncoreScopes`](06-outline_incore_scopes.md). +6. **Add to Program**: Prepend the outlined functions to the program's + function list. + +**Naming**: `{original_func}_{level}_{counter}` (e.g. `main_host_0`, +`main_global_0`). When `HierarchyScopeStmt.name_hint` is non-empty the hint +is used directly. 
+ +## Example + +### Non-CORE_GROUP level (HOST) + +**Before**: + +```python +@pl.program +class Before: + @pl.function # Opaque + def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + with pl.at(level=pl.Level.HOST): + y = helper(x) + return y +``` + +**After** (parent stays `Opaque`, outlined function is `Opaque`): + +```python +@pl.program +class After: + @pl.function # unchanged + def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + y = self.main_host_0(x) + return y + + @pl.function # Opaque + def main_host_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + y = helper(x) + return y +``` + +### Multiple outputs + +```python +with pl.at(level=pl.Level.HOST): + a_tile = pl.load(a, [0], [64]) + b_tile = pl.load(b, [0], [64]) + c_tile = pl.add(a_tile, b_tile) + out_a = pl.store(c_tile, [0], out) + out_b = pl.mul(c_tile, 2.0) +# both out_a and out_b used after the scope +x = out_a + out_b +``` + +After outlining, the body becomes: + +```python +out_a, out_b = self.main_host_0(a, b, out) # multiple return values +x = out_a + out_b +``` + +### CORE_GROUP scopes are skipped + +```python +@pl.function # Opaque +def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + with pl.at(level=pl.Level.CORE_GROUP): # <-- NOT outlined here + tile = pl.load(x, [0], [64]) + result = pl.store(tile, [0], x) + return result +``` + +This pass leaves the `CORE_GROUP` scope in place. The next pipeline pass, +[`OutlineIncoreScopes`](06-outline_incore_scopes.md), outlines it into +`Function(InCore)` and promotes the parent to `Orchestration`. 
+ +## Implementation + +**Header**: `include/pypto/ir/transforms/passes.h` + +```cpp +Pass OutlineHierarchyScopes(); +``` + +**Implementation**: `src/ir/transforms/outline_hierarchy_scopes_pass.cpp` + +- Uses the shared `scope_outline_utils` to compute inputs/outputs +- Builds a new `Function(Opaque)` per non-`CORE_GROUP` scope +- Copies `role_` metadata onto the outlined function's attrs +- Never modifies the parent function's `func_type_` + +**Python binding**: `python/bindings/modules/passes.cpp` + +```cpp +passes.def("outline_hierarchy_scopes", &pass::OutlineHierarchyScopes, + "Outline non-CORE_GROUP HierarchyScopeStmt regions into Opaque functions"); +``` + +**Tests**: `tests/ut/ir/transforms/test_outline_hierarchy_scopes.py` + +- Tests non-`CORE_GROUP` scope → `Opaque` function + parent unchanged +- Tests that `CORE_GROUP` scopes are left in place +- Tests input/output analysis +- Tests multiple non-`CORE_GROUP` scopes in the same parent function +- Tests SSA preservation + +## Pass Properties + +| Property | Value | +| -------- | ----- | +| Required | `SSAForm` | +| Produced | `SSAForm` | +| Invalidated | — | + +`HierarchyOutlined` is produced by +[`OutlineIncoreScopes`](06-outline_incore_scopes.md), which runs next and +handles the remaining `CORE_GROUP` scopes. + +## Pipeline Position + +```text +... → ConvertToSSA → NormalizeStmtStructure → FlattenCallExpr → +OutlineHierarchyScopes → OutlineIncoreScopes → OutlineClusterScopes → +ConvertTensorToTileOps → ... +``` diff --git a/docs/en/dev/passes/05-split_chunked_loops.md b/docs/en/dev/passes/05-split_chunked_loops.md deleted file mode 100644 index 713f1fa4b..000000000 --- a/docs/en/dev/passes/05-split_chunked_loops.md +++ /dev/null @@ -1,187 +0,0 @@ -# SplitChunkedLoops Pass - -Splits loops with `chunk` into nested outer/inner loops under one of two policies. 
- -## Overview - -This pass transforms a for loop created with `chunk=C` into a pair of nested loops: an outer loop over chunk indices and an inner loop iterating within each chunk. Two codegen policies are supported: - -- **`guarded`** (default) — emit a single outer loop of `ceil(T/C)` chunks plus an inner loop of `C`, and wrap the body in `if (idx < stop)` (or `idx > stop` for negative step). Out-of-range iterations become no-ops. A single kernel is emitted. -- **`leading_full`** — emit a full-chunk loop of `T/C` chunks plus a separate remainder loop of `T % C` iterations. Two sibling loops are emitted. - -Both policies run after SSA conversion and propagate `iter_args` through the generated loops. - -**Requires**: `TypeChecked`, `SSAForm`. - -**When to use**: Runs automatically in the default pipeline after `FlattenCallExpr` and before `InterchangeChunkLoops`. Use `chunk=` on `pl.range()`, `pl.parallel()`, or `pl.unroll()` inside a `with pl.auto_incore():` scope. Chunked loops outside `auto_incore` are not split. 
- -## API - -| C++ | Python | Level | -| --- | ------ | ----- | -| `pass::SplitChunkedLoops()` | `passes.split_chunked_loops()` | Function-level | - -```python -from pypto import passes -result = passes.split_chunked_loops()(program) -``` - -## DSL Syntax - -Chunked loops must be wrapped in `with pl.auto_incore():`: - -```python -with pl.auto_incore(): - # Default (guarded): single kernel with if-guard - for i in pl.range(10, chunk=5): - x = pl.add(x, 1.0) - - # Explicit guarded (same as default) - for i in pl.parallel(n, chunk=4, chunk_policy="guarded"): - x = pl.add(x, 1.0) - - # Explicit leading_full: peels remainder into separate loop - for i in pl.range(7, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - - # iter_args are supported under both policies - for i, (s,) in pl.range(10, init_values=(x,), chunk=5): - s = pl.add(s, 1.0) - s = pl.yield_(s) -``` - -## Choosing a Policy - -| Criterion | Prefer `guarded` | Prefer `leading_full` | -| --------- | ---------------- | --------------------- | -| Dynamic bound (`stop` not a compile-time constant) | ✅ — single kernel preserves loop-carried state across the boundary | ❌ — remainder kernel receives iter_args as input-only copies, breaking cross-iteration accumulation | -| Static bound, trip_count known divisible | Slightly redundant guard | ✅ — no guard, no remainder | -| Want minimum kernel count under `pl.auto_incore()` | ✅ | Produces 2 kernels per chunked loop | -| Want to eliminate masked iterations inside the hot loop | ❌ | ✅ — full chunks run unconditionally | - -`guarded` is the default because (1) it preserves `add_inout()` accumulation under dynamic bounds and (2) it avoids doubling the kernel count under `pl.auto_incore()`. 
- -## Constraints - -| Constraint | Reason | -| ---------- | ------ | -| `step`, `chunk` must be integer constants | Needed at compile time | -| `chunk` must be a positive integer | Non-positive sizes are invalid | -| `step` may be negative (descending loop) | `guarded` adapts the predicate to the step sign | -| `start`, `stop` may be dynamic expressions under `guarded` | Trip count becomes `max(abs(stop - start), 0) / abs(step)` | -| Chunked loop must be inside `pl.auto_incore()` | Only `auto_incore`-scoped loops are split | -| `chunk` may be combined with `init_values` | Both policies thread iter_args through the generated loops | - -## Algorithm - -Let `T = ceil(max(|stop - start|, 0) / |step|)` and `C = chunk`. - -### `guarded` (default) - -1. `n_total = ceil(T / C)` — static when bounds are const, otherwise `(T + C - 1) // C`. -2. Emit outer loop `for out_var in [0, n_total)` and inner loop `for in_var in [0, C)`. -3. Compute `idx = start + (out_var * C + in_var) * step` (substituted into body). -4. Wrap the visited body in an `IfStmt` whose condition is: - - `idx < stop` when `step > 0` - - `idx > stop` when `step < 0` -5. **Without iter_args** — IfStmt has no else branch; skipped iterations are no-ops. -6. **With iter_args** — IfStmt gets `return_vars` acting as phi nodes: the then-branch keeps the user body's trailing `YieldStmt` (updated values), the else-branch yields the inner iter_args unchanged. The inner loop's trailing `YieldStmt` references the IfStmt's phi vars, so loop-carried state threads through both guarded and skipped iterations. - -### `leading_full` - -1. `n_full = T // C`, `n_rem = T % C`. -2. Emit outer loop `for out_var in [0, n_full)` and inner loop `for in_var in [0, C)` with `idx = start + (out_var * C + in_var) * step`. Skip if `n_full == 0`. -3. If `n_rem > 0`, emit a remainder loop `for rem_var in [0, n_rem)` with `idx = start + (n_full * C + rem_var) * step`. 
Its `init_values` chain from the outer loop's `return_vars` (or from the original init if no full-chunk loop was emitted). -4. Remap the original `return_vars` to the final loop's `return_vars`. - -Both paths preserve the original `ForKind` (Sequential, Parallel, or Unroll) on inner and outer/remainder loops. - -## Auto-Name Abbreviations - -Printed IR uses the compact auto-name format `base__qualifier_role_vN`. Abbreviated qualifiers: - -| Abbreviation | Meaning | Emitted by | -| ------------ | ------- | ---------- | -| `co` | chunk_outer | both policies | -| `ci` | chunk_inner | both policies | -| `cr` | chunk_rem (remainder) | `leading_full` only | -| `cg` | chunk_guard (IfStmt phi) | `guarded` with iter_args only | - -Examples: `i__co_idx_v0` (outer index), `x__ci_iter_v1` (inner iter_arg), `x__cr_rv_v1` (remainder return var), `x__cg_rv_v1` (IfStmt phi var). - -## Examples - -### `guarded`, divisible (`chunk=5`, trip_count=10) - -**After**: - -```python -for i__co_idx_v0, (x__co_iter_v1,) in pl.range(2, init_values=(x__ssa_v0,)): - for i__ci_idx_v0, (x__ci_iter_v1,) in pl.range(5, init_values=(x__co_iter_v1,)): - if i__co_idx_v0 * 5 + i__ci_idx_v0 < 10: - x__ssa_v3 = pl.tensor.add(x__ci_iter_v1, 1.0) - x__cg_rv_v1 = pl.yield_(x__ssa_v3) - else: - x__cg_rv_v1 = pl.yield_(x__ci_iter_v1) - x__ci_rv_v1 = pl.yield_(x__cg_rv_v1) - x__co_rv_v1 = pl.yield_(x__ci_rv_v1) -return x__co_rv_v1 -``` - -### `guarded`, dynamic bound (`chunk=4`, `stop=n`) - -**After** (single kernel, `n_total = (n + 3) // 4`): - -```python -for i__co_idx_v0, (x__co_iter_v1,) in pl.range((n + 3) // 4, init_values=(x__ssa_v0,)): - for i__ci_idx_v0, (x__ci_iter_v1,) in pl.range(4, init_values=(x__co_iter_v1,)): - if i__co_idx_v0 * 4 + i__ci_idx_v0 < n: - x__ssa_v3 = pl.tensor.add(x__ci_iter_v1, 1.0) - x__cg_rv_v1 = pl.yield_(x__ssa_v3) - else: - x__cg_rv_v1 = pl.yield_(x__ci_iter_v1) - x__ci_rv_v1 = pl.yield_(x__cg_rv_v1) - x__co_rv_v1 = pl.yield_(x__ci_rv_v1) -return x__co_rv_v1 -``` - -### 
`leading_full`, non-divisible (`chunk=5`, trip_count=7) - -**After** (two sibling loops): - -```python -for i__co_idx_v0, (x__co_iter_v1,) in pl.range(1, init_values=(x__ssa_v0,)): - for i__ci_idx_v0, (x__ci_iter_v1,) in pl.range(5, init_values=(x__co_iter_v1,)): - x__ssa_v3 = pl.tensor.add(x__ci_iter_v1, 1.0) - x__ci_rv_v1 = pl.yield_(x__ssa_v3) - x__co_rv_v1 = pl.yield_(x__ci_rv_v1) -for i__cr_idx_v0, (x__cr_iter_v1,) in pl.range(2, init_values=(x__co_rv_v1,)): - x__ssa_v4 = pl.tensor.add(x__cr_iter_v1, 1.0) - x__cr_rv_v1 = pl.yield_(x__ssa_v4) -return x__cr_rv_v1 -``` - -## LoopOrigin Tagging - -| LoopOrigin | Description | Emitted by | -| ---------- | ----------- | ---------- | -| `Original` | Regular user loop (default) | — | -| `ChunkOuter` | Outer loop over chunk indices | both policies | -| `ChunkInner` | Inner loop within a chunk | both policies | -| `ChunkRemainder` | Remainder loop for leftover iterations | `leading_full` only | - -Access via `for_stmt.attrs.get("loop_origin")` (Python) or `for_stmt->GetAttr("loop_origin")` (C++). - -## Pipeline Position - -```text -UnrollLoops → ConvertToSSA → FlattenCallExpr → SplitChunkedLoops → InterchangeChunkLoops → OutlineIncoreScopes → ... -``` - -## Pass Properties - -| Property | Value | -| -------- | ----- | -| Required | `TypeChecked`, `SSAForm` | -| Produced | `TypeChecked`, `SSAForm` | -| Invalidated | (none) | diff --git a/docs/en/dev/passes/06-interchange_chunk_loops.md b/docs/en/dev/passes/06-interchange_chunk_loops.md deleted file mode 100644 index b6547b6af..000000000 --- a/docs/en/dev/passes/06-interchange_chunk_loops.md +++ /dev/null @@ -1,197 +0,0 @@ -# InterchangeChunkLoops Pass - -Reorders nested ChunkOuter/ChunkInner loop pairs and inserts `InCore` scopes for downstream outlining. 
- -## Overview - -After `SplitChunkedLoops` splits chunked loops into nested `ChunkOuter→ChunkInner` pairs, the structure for nested chunked loops is: - -```text -i_out[ChunkOuter] → i_in[ChunkInner,Parallel] → j_out[ChunkOuter] → j_in[ChunkInner,Parallel] → body -``` - -This pass reorders so all outer loops are on top and wraps the inner loops + body in `InCoreScopeStmt`: - -```text -i_out[ChunkOuter] → j_out[ChunkOuter] → InCore{ i_in[ChunkInner] → j_in[ChunkInner] → body } -``` - -**Requires**: TypeChecked, SSAForm properties. - -**When to use**: Runs automatically in the default pipeline after `SplitChunkedLoops` and before `OutlineIncoreScopes`. Only operates on loops inside `pl.auto_incore()` scope. The `AutoInCore` scope is consumed (removed) by this pass. - -## API - -| C++ | Python | Level | -| --- | ------ | ----- | -| `pass::InterchangeChunkLoops()` | `passes.interchange_chunk_loops()` | Function-level | - -**Python usage**: - -```python -from pypto import passes - -result = passes.interchange_chunk_loops()(program) -``` - -## Constraints - -| Constraint | Behavior | -| ---------- | -------- | -| SSA-only | Runs after `SplitChunkedLoops` (requires `SSAForm`) | -| Parallel-only interchange | Only interchanges when ALL ChunkInner loops have `ForKind::Parallel` | -| Sequential chunked loops | Not interchanged, but wrapped in InCore if inside `auto_incore` | -| Existing InCore | If chain body already contains `InCoreScopeStmt`, skip | -| Requires `auto_incore` scope | Only loops inside `AutoInCoreScopeStmt` are processed; the scope is consumed | - -## Algorithm - -1. **Collect chain** — Starting from a `ChunkOuter` ForStmt, walk into nested ForStmt body. Build list of `(ForStmt, LoopOrigin)` entries. Stop at non-ForStmt, `Original` loop, or `ScopeStmt`. - -2. **Guard checks** — Verify all ChunkInner loops are Parallel. Check no existing InCore scope in innermost body. - -3. **Separate** — Split chain into `outers` (ChunkOuter) and `inners` (ChunkInner). 
- -4. **Reconstruct** (inside-out build): - - Visit the innermost body - - Wrap inners around body (preserving order), reconnecting iter_args - - Wrap in `InCoreScopeStmt` - - Wrap outers around InCore (preserving order), reconnecting iter_args and yields - -5. **Handle remainders** — `ChunkRemainder` loops: recurse into body. Wrap standalone parallel remainder sub-loops in InCore. - -## Auto-Name Abbreviations - -The examples below use compact qualifiers inside `base__qualifier_role_vN` names: - -| Abbreviation | Meaning | -| ------------ | ------- | -| `co` | `chunk_outer` | -| `ci` | `chunk_inner` | -| `cr` | `chunk_rem` / chunk remainder | -| `lN` | interchange loop level `N` | - -Examples: - -- `x__co_iter_v1` = chunk-outer iter_arg before interchange -- `x__co_l0_iter_v1` = loop-threaded iter_arg after interchange, level 0 -- `x__co_l2_rv_v1` = return var flowing out of reordered level 2 - -Roles such as `iter`, `rv`, `idx`, and `ssa` remain unabridged so the variable's purpose stays obvious. 
- -## Example - -**Before** (after SplitChunkedLoops, all parallel): - -```python -for i__co_idx_v0, (x__co_iter_v1,) in pl.range(2, init_values=(x__ssa_v0,)): # ChunkOuter - for i__ci_idx_v0, (x__ci_iter_v1,) in pl.parallel( - 4, init_values=(x__co_iter_v1,) - ): # ChunkInner - for j__co_idx_v0, (y__co_iter_v1,) in pl.range( - 3, init_values=(x__ci_iter_v1,) - ): # ChunkOuter - for j__ci_idx_v0, (y__ci_iter_v1,) in pl.parallel( - 4, init_values=(y__co_iter_v1,) - ): # ChunkInner - z = pl.add(y__ci_iter_v1, 1.0) - y__ci_rv_v1 = pl.yield_(z) - y__co_rv_v1 = pl.yield_(y__ci_rv_v1) - x__ci_rv_v1 = pl.yield_(y__co_rv_v1) - x__co_rv_v1 = pl.yield_(x__ci_rv_v1) -return x__co_rv_v1 -``` - -**After** (InterchangeChunkLoops): - -```python -for i__co_idx_v0, (x__co_l0_iter_v1,) in pl.range( - 2, init_values=(x__ssa_v0,) -): # ChunkOuter - for j__co_idx_v0, (x__co_l1_iter_v1,) in pl.range( - 3, init_values=(x__co_l0_iter_v1,) - ): # ChunkOuter - with pl.incore(): # InCore inserted - for i__ci_idx_v0, (x__co_l2_iter_v1,) in pl.parallel( - 4, init_values=(x__co_l1_iter_v1,) - ): # ChunkInner - for j__ci_idx_v0, (x__co_l3_iter_v1,) in pl.parallel( - 4, init_values=(x__co_l2_iter_v1,) - ): # ChunkInner - z = pl.add(x__co_l3_iter_v1, 1.0) - x__co_l3_rv_v1 = pl.yield_(z) - x__co_l2_rv_v1 = pl.yield_(x__co_l3_rv_v1) - x__co_l1_rv_v1 = pl.yield_(x__co_l2_rv_v1) - x__co_l0_rv_v1 = pl.yield_(x__co_l1_rv_v1) -return x__co_l0_rv_v1 -``` - -## Remainder Handling - -For non-divisible trip counts, remainder loops get InCore wrapping: - -```python -for i_rem, (...) in pl.parallel(2, init_values=(...)): # ChunkRemainder - for j_out, (...) in pl.range(3, init_values=(...)): # Interchange applied - with pl.incore(): - for j_in, (...) in pl.parallel(4, init_values=(...)): - body - with pl.incore(): # Remainder wrapped - for j_rem, (...) 
in pl.parallel(2, init_values=(...)): - body -``` - -## Non-Chunk Statement Handling - -When `auto_incore` is consumed, statements that were not handled by chunk interchange (standalone tensor ops, non-chunked loops, sequential chunked loops that failed the parallel guard) are wrapped in `InCoreScopeStmt` to ensure they get outlined into InCore functions by `OutlineIncoreScopes`. - -Consecutive non-InCore statements are grouped into a single `InCoreScopeStmt`. Control flow statements (`YieldStmt`, `ReturnStmt`) and pure scalar assignments (e.g., index arithmetic like `offset = ob * 32`) are never wrapped — they stay in the orchestration scope. - -**Example** — standalone op + parallel chunk: - -```python -# Before (inside auto_incore, after SplitChunkedLoops) -with pl.auto_incore(): - x = pl.add(x, 1.0) # standalone op - for i_out in pl.range(2): # ChunkOuter (parallel inner) - for i_in in pl.parallel(4): - x = pl.add(x, 2.0) - -# After InterchangeChunkLoops -with pl.incore(): # standalone wrapped - x = pl.add(x, 1.0) -for i_out in pl.range(2): # interchanged chunk - with pl.incore(): - for i_in in pl.parallel(4): - x = pl.add(x, 2.0) -``` - -**Example** — sequential chunk (fails interchange guard): - -```python -# Before -with pl.auto_incore(): - for i_out in pl.range(2): # ChunkOuter (sequential inner) - for i_in in pl.range(4): # ChunkInner, Sequential → fails guard - x = pl.add(x, 1.0) - -# After — entire chain wrapped in InCore -with pl.incore(): - for i_out in pl.range(2): - for i_in in pl.range(4): - x = pl.add(x, 1.0) -``` - -## Pipeline Position - -```text -UnrollLoops → ConvertToSSA → FlattenCallExpr → SplitChunkedLoops → InterchangeChunkLoops → OutlineIncoreScopes → ... 
-``` - -## Pass Properties - -| Property | Value | -| -------- | ----- | -| Required | `TypeChecked`, `SSAForm` | -| Produced | `TypeChecked`, `SSAForm` | -| Invalidated | (none) | diff --git a/docs/en/dev/passes/06-outline_incore_scopes.md b/docs/en/dev/passes/06-outline_incore_scopes.md new file mode 100644 index 000000000..ec1f5c1a8 --- /dev/null +++ b/docs/en/dev/passes/06-outline_incore_scopes.md @@ -0,0 +1,228 @@ +# OutlineIncoreScopes Pass + +Outlines `HierarchyScopeStmt` regions with `level_ == CORE_GROUP` into +dedicated `Function(InCore)` definitions and promotes the enclosing parent +function from `Opaque` to `Orchestration`. + +## Overview + +This pass specifically targets the `CORE_GROUP` form of +`HierarchyScopeStmt` — the per-core-group kernel region introduced by +`with pl.at(level=pl.Level.CORE_GROUP):`. Each such scope is extracted +into a new `Function` whose `func_type_` is `FunctionType::InCore`, and the +original scope is replaced with a `Call` to that outlined function. Whenever +any `CORE_GROUP` scope is outlined out of a given parent function, that +parent's `func_type_` is promoted from `Opaque` to `Orchestration`. + +This pass is the CORE_GROUP counterpart of +[`OutlineHierarchyScopes`](05-outline_hierarchy_scopes.md), which handles +the remaining (non-CORE_GROUP) hierarchy levels by emitting +`Function(Opaque)` and leaving the parent type alone. 
+ +| Scope `level_` | Outlined function type | Parent function type after pass | +| -------------- | ---------------------- | ------------------------------- | +| `Level.CORE_GROUP` | `FunctionType::InCore` | promoted `Opaque` → `Orchestration` | +| any other level | *(not handled — already outlined by `OutlineHierarchyScopes`)* | — | + +When a `CORE_GROUP` scope carries a `split_` optimization hint, the hint is +attached to the outlined `InCore` function as a `split` attribute so that +downstream passes — notably +[`ExpandMixedKernel`](11-expand_mixed_kernel.md) — can honour it when +deciding how to split the kernel into AIC / AIV halves. + +**Requirements**: + +- Input IR must be in SSA form (run `ConvertToSSA` first). SSA form is + preserved (produced) by this pass. +- Expects `OutlineHierarchyScopes` to have already run, so only + `CORE_GROUP` `HierarchyScopeStmt` nodes remain to be outlined. +- Only processes `Opaque` functions (which may contain residual + `CORE_GROUP` scopes). Functions already typed as `Orchestration`, + `InCore`, `AIC`, `AIV`, or `Group` are left untouched. + +**When to use**: Run immediately after +[`OutlineHierarchyScopes`](05-outline_hierarchy_scopes.md) and before +[`OutlineClusterScopes`](07-outline_cluster_scopes.md). By the time this +pass finishes, the `HierarchyOutlined` property holds: no +`HierarchyScopeStmt` nodes remain in `Opaque` or `Orchestration` functions. + +## API + +| C++ | Python | Level | +| --- | ------ | ----- | +| `pass::OutlineIncoreScopes()` | `passes.outline_incore_scopes()` | Program-level | + +**Factory function**: + +```cpp +Pass OutlineIncoreScopes(); +``` + +**Python usage**: + +```python +from pypto.pypto_core import passes + +outline_pass = passes.outline_incore_scopes() +program_outlined = outline_pass(program) +``` + +## Algorithm + +1. **Scan for CORE_GROUP Scopes**: Find every `HierarchyScopeStmt` in each + `Opaque` function body whose `level_ == CORE_GROUP`. +2. 
**Analyze Inputs/Outputs**: Use the shared scope-outline helpers to + compute the set of variables defined outside but used inside (inputs) + and defined inside but used outside (outputs). +3. **Create Outlined InCore Function**: Extract the scope body into a new + `Function`: + - Parameters = input variables + - Returns = output variables + - Body = the scope body + - `func_type_` = `InCore` + - Copy `role_` into function attrs. + - If the scope carries a `split_` optimization hint, copy it into the + function's `split` attr (consumed by `ExpandMixedKernel`). +4. **Replace the Scope**: Substitute the original `HierarchyScopeStmt` + with a `Call` to the outlined InCore function followed by `AssignStmt`s + that bind its return values. +5. **Promote Parent**: If any `CORE_GROUP` scope was outlined from the + parent function, re-type that parent from `Opaque` to `Orchestration`. +6. **Add to Program**: Prepend the outlined InCore functions to the + program's function list. + +**Naming**: `{original_func}_core_group_{counter}` (e.g. +`main_core_group_0`). Outlined InCore functions use a `_core_group_`-style +name suffix and are easily identifiable in printed IR. When +`HierarchyScopeStmt.name_hint` is non-empty the hint is used directly.
+ +## Example + +### CORE_GROUP → InCore + Orchestration + +**Before** (after `OutlineHierarchyScopes`, non-CORE_GROUP scopes are +already outlined; the CORE_GROUP scope still sits inline in `main`): + +```python +@pl.program +class Before: + @pl.function # Opaque + def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + y = x + 1 + + with pl.at(level=pl.Level.CORE_GROUP): + tile = pl.load(y, [0], [64]) + tile_sq = pl.mul(tile, tile) + result_tile = tile_sq + 1 + result = pl.store(result_tile, [0], x) + + z = result + 2 + return z +``` + +**After**: + +```python +@pl.program +class After: + @pl.function(type=pl.FunctionType.Orchestration) # promoted + def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + y = x + 1 + result = self.main_core_group_0(y, x) # Call to outlined InCore fn + z = result + 2 + return z + + @pl.function(type=pl.FunctionType.InCore) # outlined + def main_core_group_0(self, y: pl.Tensor[[64], pl.FP32], + x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + tile = pl.load(y, [0], [64]) + tile_sq = pl.mul(tile, tile) + result_tile = tile_sq + 1 + result = pl.store(result_tile, [0], x) + return result +``` + +### CORE_GROUP with split hint + +```python +with pl.at(level=pl.Level.CORE_GROUP, + optimizations=[pl.split(pl.SplitMode.UP_DOWN)]): + ... +``` + +The outlined `InCore` function receives the `split` hint in its attrs, +which `ExpandMixedKernel` later reads to split the kernel into AIC + AIV +halves. 
+ +### Multiple outputs + +```python +with pl.at(level=pl.Level.CORE_GROUP): + a_tile = pl.load(a, [0], [64]) + b_tile = pl.load(b, [0], [64]) + c_tile = pl.add(a_tile, b_tile) + out_a = pl.store(c_tile, [0], out) + out_b = pl.mul(c_tile, 2.0) +# both out_a and out_b used after the scope +x = out_a + out_b +``` + +After outlining, the parent body becomes: + +```python +out_a, out_b = self.main_core_group_0(a, b, out) # multiple return values +x = out_a + out_b +``` + +## Implementation + +**Header**: `include/pypto/ir/transforms/passes.h` + +```cpp +Pass OutlineIncoreScopes(); +``` + +**Implementation**: `src/ir/transforms/outline_incore_scopes_pass.cpp` + +- Uses the shared `scope_outline_utils` to compute inputs/outputs +- Builds a new `Function(InCore)` per `CORE_GROUP` scope +- Copies `role_` / `split_` metadata onto the outlined function's attrs +- Re-types the parent function from `Opaque` to `Orchestration` when at + least one `CORE_GROUP` scope was outlined out of it + +**Python binding**: `python/bindings/modules/passes.cpp` + +```cpp +passes.def("outline_incore_scopes", &pass::OutlineIncoreScopes, + "Outline CORE_GROUP HierarchyScopeStmt regions into Function(InCore) " + "and promote the parent function to Orchestration"); +``` + +**Tests**: `tests/ut/ir/transforms/test_outline_incore_scopes.py` + +- Tests `CORE_GROUP` scope → `InCore` function + parent `Orchestration` +- Tests `split_` propagation onto the outlined InCore function +- Tests input/output analysis +- Tests multiple `CORE_GROUP` scopes in the same parent function +- Tests SSA preservation + +## Pass Properties + +| Property | Value | +| -------- | ----- | +| Required | `SSAForm` | +| Produced | `SSAForm`, `HierarchyOutlined` | +| Invalidated | — | + +`HierarchyOutlined` is produced here (not by +[`OutlineHierarchyScopes`](05-outline_hierarchy_scopes.md)): after both +outline passes have run, no `HierarchyScopeStmt` nodes remain in +`Opaque`/`Orchestration` functions.
+ +## Pipeline Position + +```text +... → ConvertToSSA → NormalizeStmtStructure → FlattenCallExpr → +OutlineHierarchyScopes → OutlineIncoreScopes → OutlineClusterScopes → +ConvertTensorToTileOps → ... +``` diff --git a/docs/en/dev/passes/08-outline_cluster_scopes.md b/docs/en/dev/passes/07-outline_cluster_scopes.md similarity index 72% rename from docs/en/dev/passes/08-outline_cluster_scopes.md rename to docs/en/dev/passes/07-outline_cluster_scopes.md index c4cd839cb..c16ff83f2 100644 --- a/docs/en/dev/passes/08-outline_cluster_scopes.md +++ b/docs/en/dev/passes/07-outline_cluster_scopes.md @@ -11,7 +11,7 @@ This pass transforms `ClusterScopeStmt` nodes into separate `Function(Group)` de - Input IR must be in SSA form (run ConvertToSSA first) - Only processes Opaque and Orchestration functions -**When to use**: Run after `OutlineIncoreScopes` when the IR contains `with pl.cluster():` scopes or standalone `with pl.spmd(...):` scopes that need to be extracted into wrapper functions. +**When to use**: Run after `OutlineHierarchyScopes` and `OutlineIncoreScopes` when the IR contains `with pl.cluster():` scopes or standalone `with pl.spmd(...):` scopes that need to be extracted into wrapper functions. The cluster body may still contain calls to `Function(InCore)` produced earlier by `OutlineIncoreScopes`. 
## API @@ -42,7 +42,9 @@ program_outlined = outline_pass(program) ## Example -**Before**: +**Before** (assume `OutlineIncoreScopes` has already turned the inner +`with pl.at(level=pl.Level.CORE_GROUP): ...` scope into a call to an outlined +`Function(InCore)` named `main_core_group_0`): ```python @pl.program @@ -50,8 +52,7 @@ class Before: @pl.function def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: with pl.cluster(): - with pl.incore(): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) + y: pl.Tensor[[64], pl.FP32] = self.main_core_group_0(x) return y ``` @@ -62,8 +63,7 @@ class Before: class After: @pl.function(type=pl.FunctionType.Group) def main_cluster_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.incore(): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) + y: pl.Tensor[[64], pl.FP32] = self.main_core_group_0(x) return y @pl.function @@ -72,7 +72,10 @@ class After: return y ``` -Note: InCore scopes inside the Cluster are preserved in the outlined Group function. Run `OutlineIncoreScopes` first to outline InCore scopes before clustering, or after to outline them within Group functions. +Note: `OutlineHierarchyScopes` and `OutlineIncoreScopes` run before this +pass, so the cluster body already contains calls to `Function(InCore)` +rather than inline `HierarchyScopeStmt` nodes. The outlined Group function +preserves those calls. 
## Standalone Spmd Example @@ -133,12 +136,12 @@ class After: | Produced | SSAForm, ClusterOutlined | | Invalidated | — | -## Relationship to OutlineIncoreScopes +## Relationship to OutlineHierarchyScopes / OutlineIncoreScopes -| Aspect | OutlineIncoreScopes | OutlineClusterScopes | -| ------ | ------------------- | -------------------- | -| Scope kind | `ScopeKind::InCore` | `ScopeKind::Cluster` / standalone `ScopeKind::Spmd` | -| Output function type | `FunctionType::InCore` | `FunctionType::Group` / `FunctionType::Spmd` | -| Naming pattern | `{func}_incore_{n}` | `{func}_cluster_{n}` / `{func}_spmd_{n}` | -| Promotes parent to | Orchestration | *(unchanged)* | -| Processes | Opaque functions only | Opaque + Orchestration | +| Aspect | OutlineHierarchyScopes | OutlineIncoreScopes | OutlineClusterScopes | +| ------ | ---------------------- | ------------------- | -------------------- | +| Scope kind | `HierarchyScopeStmt` (non-CORE_GROUP) | `HierarchyScopeStmt` (CORE_GROUP) | `ClusterScopeStmt` / standalone `SpmdScopeStmt` | +| Output function type | `FunctionType::Opaque` | `FunctionType::InCore` | `FunctionType::Group` / `FunctionType::Spmd` | +| Naming pattern | `{func}_{level}_{n}` | `{func}_core_group_{n}` | `{func}_cluster_{n}` / `{func}_spmd_{n}` | +| Promotes parent to | *(unchanged)* | `Orchestration` | *(unchanged)* | +| Processes | `Opaque` functions only | `Opaque` functions only | `Opaque` + `Orchestration` | diff --git a/docs/en/dev/passes/07-outline_incore_scopes.md b/docs/en/dev/passes/07-outline_incore_scopes.md deleted file mode 100644 index 269df4d50..000000000 --- a/docs/en/dev/passes/07-outline_incore_scopes.md +++ /dev/null @@ -1,173 +0,0 @@ -# OutlineIncoreScopes Pass - -Outlines InCore scopes into separate functions. - -## Overview - -This pass transforms `InCoreScopeStmt` nodes into separate `Function(InCore)` definitions and replaces the scope with a Call to the outlined function. 
- -**Requirements**: - -- Input IR must be in SSA form (run ConvertToSSA first); SSAForm is preserved (produced) by this pass -- Only processes Opaque functions (InCore functions are left unchanged) - -**When to use**: Run after ConvertToSSA when you need to extract InCore computation regions into separate callable functions. - -## API - -| C++ | Python | Level | -| --- | ------ | ----- | -| `pass::OutlineIncoreScopes()` | `passes.outline_incore_scopes()` | Program-level | - -**Factory function**: - -```cpp -Pass OutlineIncoreScopes(); -``` - -**Python usage**: - -```python -from pypto.pypto_core import passes - -outline_pass = passes.outline_incore_scopes() -program_outlined = outline_pass(program) -``` - -## Algorithm - -1. **Scan for InCore Scopes**: Find all `InCoreScopeStmt` nodes in Opaque functions -2. **Analyze Inputs**: Determine external variable references (variables defined outside scope, used inside) -3. **Analyze Outputs**: Determine internal definitions used after scope (variables defined inside, used outside) -4. **Create Function**: Extract scope body into new `Function(scope_type=InCore)` with: - - Parameters = input variables - - Returns = output variables - - Body = scope body -5. **Replace Scope**: Replace `InCoreScopeStmt` with: - - Call to outlined function with input arguments - - AssignStmt for each output variable -6. 
**Add to Program**: Add outlined function to program's function list - -**Naming**: - -- Default: `{original_func}_incore_{counter}` (e.g., `main_incore_0`, `main_incore_1`) -- User-provided: when `InCoreScopeStmt.name_hint` is non-empty, that name is used directly - - `with pl.incore(name_hint="fused_add"):` → function named `fused_add` - -## Example - -### Basic Outlining - -**Before**: - -```python -@pl.program -class Before: - @pl.function # Opaque function - def main(self, x: Tensor[[64], FP32]) -> Tensor[[64], FP32]: - y = x + 1 - - with pl.incore(): # InCore scope - tile = pl.load(y, [0], [64]) - tile_sq = pl.mul(tile, tile) - result_tile = tile_sq + 1 - result = pl.store(result_tile, [0], x) - - z = result + 2 - return z -``` - -**After**: - -```python -@pl.program -class After: - @pl.function # Opaque function - def main(self, x: Tensor[[64], FP32]) -> Tensor[[64], FP32]: - y = x + 1 - - # Scope replaced with call + assignments - result = self.main_incore_0(y, x) # Call outlined function - - z = result + 2 - return z - - @pl.function(scope_type=InCore) # Outlined InCore function - def main_incore_0(self, y: Tensor[[64], FP32], x: Tensor[[64], FP32]) -> Tensor[[64], FP32]: - # Scope body moved here - tile = pl.load(y, [0], [64]) - tile_sq = pl.mul(tile, tile) - result_tile = tile_sq + 1 - result = pl.store(result_tile, [0], x) - return result -``` - -### Multiple Outputs - -**Before**: - -```python -with pl.incore(): - a_tile = pl.load(a, [0], [64]) - b_tile = pl.load(b, [0], [64]) - c_tile = pl.add(a_tile, b_tile) - out_a = pl.store(c_tile, [0], out) - out_b = pl.mul(c_tile, 2.0) -# Both out_a and out_b used after scope -x = out_a + out_b -``` - -**After**: - -```python -out_a, out_b = self.main_incore_0(a, b, out) # Multiple outputs -x = out_a + out_b - -# Outlined function: -def main_incore_0(self, a, b, out): - a_tile = pl.load(a, [0], [64]) - b_tile = pl.load(b, [0], [64]) - c_tile = pl.add(a_tile, b_tile) - out_a = pl.store(c_tile, [0], out) - out_b = 
pl.mul(c_tile, 2.0) - return (out_a, out_b) -``` - -## Implementation - -**Header**: `include/pypto/ir/transforms/passes.h` - -```cpp -Pass OutlineIncoreScopes(); -``` - -**Implementation**: `src/ir/transforms/outline_incore_scopes.cpp` - -- Uses SSA analysis to determine inputs/outputs -- Creates new Function nodes with InCore scope type -- Replaces InCoreScopeStmt with Call + AssignStmt -- Manages function naming and counters - -**Python binding**: `python/bindings/modules/passes.cpp` - -```cpp -passes.def("outline_incore_scopes", &pass::OutlineIncoreScopes, "Outline InCore scopes"); -``` - -**Tests**: `tests/ut/ir/transforms/test_outline_incore_scopes.py` - -- Tests basic scope outlining -- Tests input/output analysis -- Tests multiple scopes in same function -- Tests nested scopes -- Tests SSA preservation - -## Requirements - -**SSA form required**: The pass relies on SSA properties: - -- Single assignment ensures clear input/output analysis -- No variable shadowing simplifies scope analysis -- YieldStmt in control flow handled correctly - -**Run ConvertToSSA first** if IR is not in SSA form. diff --git a/docs/en/dev/passes/09-convert_tensor_to_tile_ops.md b/docs/en/dev/passes/08-convert_tensor_to_tile_ops.md similarity index 90% rename from docs/en/dev/passes/09-convert_tensor_to_tile_ops.md rename to docs/en/dev/passes/08-convert_tensor_to_tile_ops.md index 1c069c3ca..243f575ed 100644 --- a/docs/en/dev/passes/09-convert_tensor_to_tile_ops.md +++ b/docs/en/dev/passes/08-convert_tensor_to_tile_ops.md @@ -4,14 +4,14 @@ Converts tensor operations to tile operations in InCore functions and updates or ## Overview -After `OutlineIncoreScopes` extracts InCore scopes into separate functions, those functions still operate on `TensorType` variables using `tensor.*` operations. This pass lowers them to `TileType` variables with `tile.*` operations that map directly to PTO-ISA instructions. 
+After `OutlineHierarchyScopes` and `OutlineIncoreScopes` extract `HierarchyScopeStmt` regions into separate functions (with `OutlineIncoreScopes` producing `Function(InCore)` for `CORE_GROUP` scopes), those InCore functions still operate on `TensorType` variables using `tensor.*` operations. This pass lowers them to `TileType` variables with `tile.*` operations that map directly to PTO-ISA instructions. The pass also updates call sites in orchestration/opaque functions: for each new output parameter added to an InCore function, a `tensor.create` is inserted at the call site. **Requirements**: - Input IR must be in SSA form -- InCore scopes must be outlined (run `OutlineIncoreScopes` first) +- Hierarchy scopes must be outlined into functions (run `OutlineHierarchyScopes` and `OutlineIncoreScopes` first) - Statement structure must be normalized **When to use**: Run after `OutlineClusterScopes` and before `OptimizeOrchTensors`. @@ -119,7 +119,7 @@ Key changes: | Property | Value | | -------- | ----- | -| Required | SSAForm, SplitIncoreOrch, NormalizedStmtStructure | +| Required | SSAForm, HierarchyOutlined, NormalizedStmtStructure | | Produced | SSAForm, IncoreTileOps, NormalizedStmtStructure | | Invalidated | — | diff --git a/docs/en/dev/passes/10-optimize_orch_tensors.md b/docs/en/dev/passes/09-optimize_orch_tensors.md similarity index 98% rename from docs/en/dev/passes/10-optimize_orch_tensors.md rename to docs/en/dev/passes/09-optimize_orch_tensors.md index b80645abc..616abf3d7 100644 --- a/docs/en/dev/passes/10-optimize_orch_tensors.md +++ b/docs/en/dev/passes/09-optimize_orch_tensors.md @@ -132,8 +132,8 @@ The `tensor.create` is eliminated; the iter-arg buffer is reused across iteratio | Property | Value | | -------- | ----- | -| Required | SplitIncoreOrch, IncoreTileOps | -| Produced | SplitIncoreOrch, IncoreTileOps | +| Required | HierarchyOutlined, IncoreTileOps | +| Produced | HierarchyOutlined, IncoreTileOps | | Invalidated | — | ## Key Components diff 
--git a/docs/en/dev/passes/11-flatten_tile_nd_to_2d.md b/docs/en/dev/passes/10-flatten_tile_nd_to_2d.md similarity index 100% rename from docs/en/dev/passes/11-flatten_tile_nd_to_2d.md rename to docs/en/dev/passes/10-flatten_tile_nd_to_2d.md diff --git a/docs/en/dev/passes/14-expand_mixed_kernel.md b/docs/en/dev/passes/11-expand_mixed_kernel.md similarity index 96% rename from docs/en/dev/passes/14-expand_mixed_kernel.md rename to docs/en/dev/passes/11-expand_mixed_kernel.md index d5ce000c9..5f245b0b8 100644 --- a/docs/en/dev/passes/14-expand_mixed_kernel.md +++ b/docs/en/dev/passes/11-expand_mixed_kernel.md @@ -4,7 +4,7 @@ Expands mixed InCore functions into separate AIC (Cube) + AIV (Vector) kernels w ## Overview -After `OutlineIncoreScopes` and `ConvertTensorToTileOps`, InCore functions may contain both Cube ops (`tile.matmul`, `tile.gemv`, etc.) and Vector ops (`tile.add`, `tile.exp`, etc.). Some ops like `tile.load`, `tile.store`, `tile.move`, and `tile.reshape` are classified as Cube or Vector based on the MemorySpace of their tile operands. Functions containing ops from both sides are **mixed InCore functions**. Hardware requires Cube and Vector operations to run on separate core types, so this pass splits them into: +After `OutlineHierarchyScopes` and `ConvertTensorToTileOps`, InCore functions may contain both Cube ops (`tile.matmul`, `tile.gemv`, etc.) and Vector ops (`tile.add`, `tile.exp`, etc.). Some ops like `tile.load`, `tile.store`, `tile.move`, and `tile.reshape` are classified as Cube or Vector based on the MemorySpace of their tile operands. Functions containing ops from both sides are **mixed InCore functions**. 
 Hardware requires Cube and Vector operations to run on separate core types, so this pass splits them into: - **AIC function** (`FunctionType::AIC`) — contains only Cube + shared ops - **AIV function** (`FunctionType::AIV`) — contains only Vector + shared ops @@ -76,7 +76,7 @@ For consumer-side cross-core tiles, the pass also normalizes statement order to **Requirements**: - Input IR must have tile ops (run `ConvertTensorToTileOps` first) -- Input IR must have InCore scopes outlined (run `OutlineIncoreScopes` first) +- Input IR must have hierarchy scopes outlined into functions (run `OutlineHierarchyScopes` and `OutlineIncoreScopes` first) - Tile ops must be flattened to 2D (run `FlattenTileNdTo2D` first) - Tile memory space must be inferred (run `InferTileMemorySpace` first) - Cross-core fractal TileView assignment is supported on Ascend950 and Ascend910B backends @@ -292,7 +292,7 @@ class After: | Property | Value | | -------- | ----- | -| Required | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D, TileMemoryInferred | +| Required | SSAForm, IncoreTileOps, HierarchyOutlined, TileOps2D, TileMemoryInferred | | Produced | SSAForm, MixedKernelExpanded | | Invalidated | — | diff --git a/docs/en/dev/passes/15-init_memref.md b/docs/en/dev/passes/12-init_memref.md similarity index 98% rename from docs/en/dev/passes/15-init_memref.md rename to docs/en/dev/passes/12-init_memref.md index de54b5100..e37b8b381 100644 --- a/docs/en/dev/passes/15-init_memref.md +++ b/docs/en/dev/passes/12-init_memref.md @@ -12,7 +12,7 @@ This pass performs three tasks: Memory space is read from `TileType::memory_space_` (set by InferTileMemorySpace). Variables without `memory_space` default to DDR. -**Requires**: SSAForm, SplitIncoreOrch, IncoreTileOps, TileOps2D, TileMemoryInferred. +**Requires**: SSAForm, HierarchyOutlined, IncoreTileOps, TileOps2D, TileMemoryInferred. **Produces**: HasMemRefs, NormalizedStmtStructure.
diff --git a/docs/en/dev/passes/16-memory_reuse.md b/docs/en/dev/passes/13-memory_reuse.md similarity index 100% rename from docs/en/dev/passes/16-memory_reuse.md rename to docs/en/dev/passes/13-memory_reuse.md diff --git a/docs/en/dev/passes/17-allocate_memory_addr.md b/docs/en/dev/passes/14-allocate_memory_addr.md similarity index 100% rename from docs/en/dev/passes/17-allocate_memory_addr.md rename to docs/en/dev/passes/14-allocate_memory_addr.md diff --git a/docs/en/dev/passes/20-partial_unroll_tile_loops.md b/docs/en/dev/passes/15-partial_unroll_tile_loops.md similarity index 97% rename from docs/en/dev/passes/20-partial_unroll_tile_loops.md rename to docs/en/dev/passes/15-partial_unroll_tile_loops.md index 14f9d2786..b35ce7629 100644 --- a/docs/en/dev/passes/20-partial_unroll_tile_loops.md +++ b/docs/en/dev/passes/15-partial_unroll_tile_loops.md @@ -8,7 +8,7 @@ Lowers `pl.range(N, unroll=F)` at the tile level: replicates the loop body `F` t `PartialUnrollTileLoops` provides the targeted knob: replicate the body `F` times (typically 2–4) at the tile level, leaving an outer loop of `N/F` iterations. Each clone gets fresh def-vars (SSA preserved) and operates on independent tiles, which downstream `MemoryReuse` cannot merge. -**Requires**: SSAForm, SplitIncoreOrch, IncoreTileOps, TileOps2D, TileMemoryInferred, NormalizedStmtStructure. +**Requires**: SSAForm, HierarchyOutlined, IncoreTileOps, TileOps2D, TileMemoryInferred, NormalizedStmtStructure. **Pipeline position**: After `NormalizeReturnOrder`, before `InitMemRef` (slot 20.5). Late enough that all tile-structural decisions are made; early enough that `InitMemRef`/`MemoryReuse` see distinct tile vars per clone. 
@@ -157,6 +157,6 @@ Every main-loop iteration AND every tail branch carries the `unroll_replicated` ## Related -- [`ReorderUnrolledIO`](21-reorder_unrolled_io.md) — consumes the `unroll_replicated` marker +- [`ReorderUnrolledIO`](16-reorder_unrolled_io.md) — consumes the `unroll_replicated` marker - [`UnrollLoops`](01-unroll_loops.md) — full-unroll pass at slot #1, kept as the primary `pl.unroll(N)` lowering - RFC #1025 — design document diff --git a/docs/en/dev/passes/21-reorder_unrolled_io.md b/docs/en/dev/passes/16-reorder_unrolled_io.md similarity index 95% rename from docs/en/dev/passes/21-reorder_unrolled_io.md rename to docs/en/dev/passes/16-reorder_unrolled_io.md index 6b642e00c..4ade4e2d9 100644 --- a/docs/en/dev/passes/21-reorder_unrolled_io.md +++ b/docs/en/dev/passes/16-reorder_unrolled_io.md @@ -14,7 +14,7 @@ This pass reorders each marked `SeqStmts` so: The result is `[loads…, compute…, stores…]` whenever the dataflow allows. Sibling clones' input tiles are co-live near the top, output tiles co-live near the bottom — `MemoryReuse` cannot coalesce them, so each clone keeps its own MemRef and ping-pong buffering becomes possible. -**Requires**: SSAForm, SplitIncoreOrch, IncoreTileOps, TileOps2D, TileMemoryInferred, NormalizedStmtStructure. +**Requires**: SSAForm, HierarchyOutlined, IncoreTileOps, TileOps2D, TileMemoryInferred, NormalizedStmtStructure. **Pipeline position**: After `PartialUnrollTileLoops`, before `InitMemRef` (slot 20.6). Running before `InitMemRef` keeps SSAForm intact for the dependency analysis. 
@@ -112,7 +112,7 @@ All four `tile_x_k` are now co-live up to the last load, and all four `tile_y_k` ## Related -- [`PartialUnrollTileLoops`](20-partial_unroll_tile_loops.md) — produces the `unroll_replicated` marker this pass consumes -- [`MemoryReuse`](16-memory_reuse.md) — runs after this pass; benefits from the co-live tiles +- [`PartialUnrollTileLoops`](15-partial_unroll_tile_loops.md) — produces the `unroll_replicated` marker this pass consumes +- [`MemoryReuse`](13-memory_reuse.md) — runs after this pass; benefits from the co-live tiles - RFC #1025 — design document - RFC #1026 / PR #1029 — InOut-use discipline + dependency analysis utility diff --git a/docs/en/dev/passes/99-verifier.md b/docs/en/dev/passes/99-verifier.md index 79c68238e..89a55b0b1 100644 --- a/docs/en/dev/passes/99-verifier.md +++ b/docs/en/dev/passes/99-verifier.md @@ -15,7 +15,7 @@ Extensible verification system for validating PyPTO IR correctness through plugg - **Pluggable Rule System**: Extend with custom verification rules - **Property-Based Verification**: Opt-in property sets — verify exactly what you need -- **Structural Properties**: TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore, and InOutUseValid are verified at pipeline start by `PassPipeline` and before/after each pass by `VerificationInstrument` +- **Structural Properties**: TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, and InOutUseValid are verified at pipeline start by `PassPipeline` and before/after each pass by `VerificationInstrument` - **Dual Verification Modes**: Collect diagnostics or throw on first error - **Pass Integration**: Use as a Pass in optimization pipelines - **Comprehensive Diagnostics**: Collect all issues with source locations @@ -26,10 +26,10 @@ Extensible verification system for validating PyPTO IR correctness through plugg | Category | Examples | Behavior | | -------- | -------- | -------- | -| **Structural** 
| TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore, InOutUseValid | Always true. Verified at pipeline start and before/after each pass by `VerificationInstrument`. Never in PassProperties. | +| **Structural** | TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, InOutUseValid | Always true. Verified at pipeline start and before/after each pass by `VerificationInstrument`. Never in PassProperties. | | **Pipeline** | SSAForm, NoNestedCalls, HasMemRefs, ... | Produced/invalidated by passes. Verified per pass-declared contracts. | -`GetStructuralProperties()` returns `{TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore, InOutUseValid}`. These are verified **at pipeline start** by `PassPipeline::Run()` and **before/after each pass** by `VerificationInstrument`. Since no pass declares them in `required`/`produced`/`invalidated`, `VerificationInstrument` unions them with the pass's declared properties to ensure no pass breaks these fundamental invariants. +`GetStructuralProperties()` returns `{TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, InOutUseValid}`. These are verified **at pipeline start** by `PassPipeline::Run()` and **before/after each pass** by `VerificationInstrument`. Since no pass declares them in `required`/`produced`/`invalidated`, `VerificationInstrument` unions them with the pass's declared properties to ensure no pass breaks these fundamental invariants. 
### Verification Rule System @@ -68,12 +68,11 @@ The `run_verifier()` utility creates a standalone `Pass` for ad-hoc use in custo | **UseAfterDefCheck** | UseAfterDef | Every Var use dominated by a definition (param, AssignStmt, loop var, iter_arg, return_var) | | **NormalizedStmtStructure** | NormalizedStmtStructure | Nested `SeqStmts` flattened and single-child `SeqStmts` unwrapped | | **NoRedundantBlocks** | NoRedundantBlocks | No single-child or nested `SeqStmts` | -| **SplitIncoreOrch** | SplitIncoreOrch | No `InCoreScopeStmt` nodes remain in Opaque functions | +| **HierarchyOutlined** | HierarchyOutlined | No `HierarchyScopeStmt` nodes remain in `Opaque` or `Orchestration` functions | | **IncoreTileOps** | IncoreTileOps | InCore functions use tile ops (no tensor-level ops remain) | | **HasMemRefs** | HasMemRefs | All TileType variables have MemRef initialized | | **AllocatedMemoryAddr** | AllocatedMemoryAddr | All MemRefs have valid addresses within buffer limits | | **OutParamNotShadowed** | OutParamNotShadowed | Out/InOut params not reassigned with tensor-creating ops | -| **NoNestedInCore** | NoNestedInCore | No nested InCore scopes (`InCoreScopeStmt` inside `InCoreScopeStmt`) | | **InOutUseValid** | InOutUseValid | Variables passed as InOut/Out to user-function calls are not read after the call (RFC #1026). Group-typed function bodies are skipped pending follow-up. | ### SSAVerify @@ -161,8 +160,8 @@ Singleton registry mapping `IRProperty` values to `PropertyVerifier` factories. 
| Function | Returns | Description | | -------- | ------- | ----------- | -| `GetStructuralProperties()` | `{TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore, InOutUseValid}` | Invariants verified at pipeline start and before/after each pass | -| `GetDefaultVerifyProperties()` | `{SSAForm, TypeChecked, NoNestedCalls, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore}` | Default set for `run_verifier()` | +| `GetStructuralProperties()` | `{TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, InOutUseValid}` | Invariants verified at pipeline start and before/after each pass | +| `GetDefaultVerifyProperties()` | `{SSAForm, TypeChecked, NoNestedCalls, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed}` | Default set for `run_verifier()` | | `GetVerifiedProperties()` | `{SSAForm, TypeChecked, AllocatedMemoryAddr, BreakContinueValid, NoRedundantBlocks, InOutUseValid}` | Lightweight set for `PassPipeline` auto-verify | ### RunVerifier Pass Factory diff --git a/docs/en/user/01-language_guide.md b/docs/en/user/01-language_guide.md index 6ce6b77ba..acc68f7b4 100644 --- a/docs/en/user/01-language_guide.md +++ b/docs/en/user/01-language_guide.md @@ -410,50 +410,23 @@ class Model: Mark a code region as InCore execution without making a separate function: ```python -# Preferred (new API): with pl.at(level=pl.Level.CORE_GROUP): y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - -# Deprecated (use pl.at instead): -with pl.incore(): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) ``` -For compiler-driven chunked loop outlining (AutoInCore), pass `pl.auto_chunk` in -the `optimizations` list: +`OutlineIncoreScopes` later extracts this region into a +`Function(InCore)` and re-types the parent `Opaque` function as +`Orchestration`. 
(Non-CORE_GROUP `pl.at(level=...)` regions are extracted +by the preceding `OutlineHierarchyScopes` pass into `Function(Opaque)`, +without parent promotion.) -```python -# Preferred (new API): -with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk]): - for i in pl.parallel(0, 8, 1, chunk=4): - x = pl.add(x, x) - -# Deprecated (still works, emits DeprecationWarning): -with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - ... - -with pl.auto_incore(): - ... -``` - -To set a cross-core split mode (consumed by the `ExpandMixedKernel` pass), use -`pl.split(...)` — independent from `pl.auto_chunk`, so the two can be combined: +To set a cross-core split mode (consumed by the `ExpandMixedKernel` pass), +pass `pl.split(...)` in `optimizations`: ```python -# Plain InCore + split hint: with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.split(pl.SplitMode.UP_DOWN)]): y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - -# AutoInCore + split hint (independent entries, combined freely): -with pl.at(level=pl.Level.CORE_GROUP, - optimizations=[pl.auto_chunk, pl.split(pl.SplitMode.UP_DOWN)]): - for i in pl.parallel(0, 8, 1, chunk=4): - x = pl.add(x, x) - -# Deprecated single-kwarg form (still works, emits DeprecationWarning): -with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - ... ``` ## Memory and Data Movement @@ -541,22 +514,28 @@ The `Default` strategy runs these passes in order: 1. **UnrollLoops** — unroll loop iterations 2. **CtrlFlowTransform** — rewrite control flow to structured IR 3. **ConvertToSSA** — convert to static single assignment form -4. **FlattenCallExpr** — flatten nested function calls -5. **SplitChunkedLoops** — split chunked loops into separate loops -6. **InterchangeChunkLoops** — interchange chunk loop ordering -7. **OutlineHierarchyScopes** — outline hierarchy scopes -8. **OutlineIncoreScopes** — outline InCore scopes into separate functions -9. **OutlineClusterScopes** — outline cluster scopes -10. 
**ConvertTensorToTileOps** — convert tensor operations to tile operations +4. **NormalizeStmtStructure** — flatten/unwrap redundant `SeqStmts` +5. **FlattenCallExpr** — flatten nested function calls +6. **OutlineHierarchyScopes** — outline non-CORE_GROUP `HierarchyScopeStmt` regions into `Function(Opaque)` +7. **OutlineIncoreScopes** — outline CORE_GROUP `HierarchyScopeStmt` regions into `Function(InCore)`; promote parent to `Orchestration` +8. **OutlineClusterScopes** — outline cluster scopes into Group functions +9. **ConvertTensorToTileOps** — convert tensor operations to tile operations +10. **OptimizeOrchTensors** — optimize orchestration-level tensor ops 11. **FlattenTileNdTo2D** — normalize ND tile ops to 2D 12. **InferTileMemorySpace** — infer tile memory spaces 13. **ResolveTransposeLayout** — repair transpose layout handling 14. **ResolveBackendOpLayouts** — repair backend-constrained tile layouts 15. **ExpandMixedKernel** — split mixed kernels when needed -16. **InitMemRef** — assign memory spaces and insert buffer allocations -17. **MemoryReuse** — share buffers with non-overlapping lifetimes -18. **LegalizePTOBufferReuse** — legalize PTO buffer reuse patterns -19. **AllocateMemoryAddr** — assign concrete memory addresses +16. **SplitVectorKernel** — split vector kernels when needed +17. **NormalizeReturnOrder** — reorder returns to match Out/InOut params +18. **PartialUnrollTileLoops** — partially unroll tile-level loops +19. **ReorderUnrolledIO** — group loads/stores of unrolled clones +20. **InitMemRef** — assign memory spaces and insert buffer allocations +21. **MemoryReuse** — share buffers with non-overlapping lifetimes +22. **LegalizePTOBufferReuse** — legalize PTO buffer reuse patterns +23. **AllocateMemoryAddr** — assign concrete memory addresses +24. **FuseCreateAssembleToSlice** — fuse create + assemble ops +25. 
**Simplify** — final simplification pass ### Debugging diff --git a/docs/en/user/02-operation_reference.md b/docs/en/user/02-operation_reference.md index 327e153b1..0fad072c9 100644 --- a/docs/en/user/02-operation_reference.md +++ b/docs/en/user/02-operation_reference.md @@ -215,6 +215,8 @@ Compare types: `EQ=0, NE=1, LT=2, LE=3, GT=4, GE=5` | `yield_` | `(*values: Any) -> Any \| tuple[Any, ...]` | Yield values from for/if scope | | `cond` | `(condition: bool \| Scalar) -> None` | Set while-loop condition (must be first statement) | | `const` | `(value: int \| float, dtype: DataType) -> int \| float` | Typed constant | -| `incore` | `() -> IncoreContext` | Context manager for InCore scope | +| `at` | `(*, level: Level, role: Role \| None = None, optimizations: Sequence[Optimization] \| None = None) -> AtContext` | Context manager for a hierarchy scope; `level=Level.CORE_GROUP` is the InCore form | +| `cluster` | `() -> ClusterContext` | Context manager for a cluster (AIC+AIV) scope | +| `spmd` | `(*, core_num: int \| Scalar, sync_start: bool = False) -> SpmdContext` | Context manager for a standalone SPMD launch scope | | `dynamic` | `(name: str) -> DynVar` | Create dynamic dimension variable | | `create_tensor` | `(shape: Sequence[IntLike], dtype: DataType, layout: TensorLayout = None) -> Tensor` | Create tensor (promoted from `pl.tensor`) | diff --git a/docs/zh-cn/dev/ir/01-hierarchy.md b/docs/zh-cn/dev/ir/01-hierarchy.md index 218c7969b..b8068b3b1 100644 --- a/docs/zh-cn/dev/ir/01-hierarchy.md +++ b/docs/zh-cn/dev/ir/01-hierarchy.md @@ -32,7 +32,12 @@ ::= "return" [ ] ::= ::= { ";" } - ::= "with" "pl.incore" "(" ")" ":" + ::= "with" "pl.at" "(" "level" "=" [ "," "role" "=" ] + [ "," "optimizations" "=" "[" "]" ] ")" + ":" + | "with" "pl.cluster" "(" ")" ":" + | "with" "pl.spmd" "(" "core_num" "=" + [ "," "sync_start" "=" ] ")" ":" ::= "break" ::= "continue" @@ -152,10 +157,8 @@ for_stmt = ir.ForStmt(i, start, stop, step, [sum_iter], body, [sum_final], span) | 
**IfStmt** | `condition_`, `then_stmts_`, `else_stmts_`, `return_vars_` | 条件分支 | | **ForStmt** | `loop_var_` (DefField), `start_`, `stop_`, `step_`, `iter_args_` (DefField), `body_`, `return_vars_` (DefField), `kind_` | 带可选迭代参数的 for 循环 | | **WhileStmt** | `condition_`, `iter_args_` (DefField), `body_`, `return_vars_` (DefField) | 带条件和迭代参数的 while 循环 | -| **InCoreScopeStmt** | `name_hint_`, `body_`, `split_`(可选) | InCore 区域;由 `OutlineIncoreScopes` 提取为 `Function(InCore)` | -| **AutoInCoreScopeStmt** | `name_hint_`, `body_`, `split_`(可选) | Auto-InCore 区域;由 `InterchangeChunkLoops` 消费 | | **ClusterScopeStmt** | `name_hint_`, `body_` | Cluster 区域;由 `OutlineClusterScopes` 提取为 `Function(Group)` | -| **HierarchyScopeStmt** | `name_hint_`, `body_`, `level_`, `role_`(可选) | 给定 Level/Role 的流水线阶段区域 | +| **HierarchyScopeStmt** | `name_hint_`, `body_`, `level_`, `role_`(可选), `split_`(可选) | 给定 Level/Role 的流水线阶段区域;当 `level_ == CORE_GROUP` 时提取为 `Function(InCore)`,其他层级提取为 `Function(Opaque)` | | **SpmdScopeStmt** | `name_hint_`, `body_`, `core_num_`, `sync_start_` | SPMD 启动区域;提取为 `Function(Spmd)` | | **YieldStmt** | `values_` | 在循环迭代中产出值 | | **EvalStmt** | `expr_` | 为副作用求值表达式 | @@ -220,24 +223,19 @@ while_stmt = ir.WhileStmt(condition, [x_iter], body, [x_final], span) ### ScopeStmt 详细说明 -`ScopeStmt` 是一个**抽象基类**,用于标记具有特定执行上下文的区域。下列五个具体子类 +`ScopeStmt` 是一个**抽象基类**,用于标记具有特定执行上下文的区域。下列三个具体子类 各自只携带其类型有效的字段——非法组合在构造时即不可表达。在 `ScopeStmt` 类型的引用上, 可使用 `s.scope_kind`(C++ 中为 `s.GetScopeKind()`)来取回类型,或使用 -`isinstance(s, InCoreScopeStmt)` 在具体类型上分派。 +`isinstance(s, HierarchyScopeStmt)` 在具体类型上分派。 -五个子类共享公共基类字段 `name_hint_: str` 和 `body_: StmtPtr`。注意: -`pl.at(level=Level.CORE_GROUP)` 实际下沉到 `InCoreScopeStmt` / -`AutoInCoreScopeStmt`,而非 `HierarchyScopeStmt`——解析器会在 `CORE_GROUP` -拒绝 `role=`。`HierarchyScopeStmt` 仅用于非 `CORE_GROUP` 的层级 -(host、cluster、global),并不是 in-core 作用域的通用替代。 +三个子类共享公共基类字段 `name_hint_: str` 和 `body_: StmtPtr`。`pl.at(level=...)` +统一下沉到 `HierarchyScopeStmt`——包括 `level=Level.CORE_GROUP`,它会产生 
+`level_ == CORE_GROUP` 且可选携带 `split_` 的 `HierarchyScopeStmt`。`OutlineIncoreScopes` +随后把该 `CORE_GROUP` 作用域提取为 `Function(InCore)`,并将其父 `Opaque` +函数升级为 `Orchestration`。非 `CORE_GROUP` 的 `HierarchyScopeStmt` 则由 +紧邻其前执行的 `OutlineHierarchyScopes` 提取为 `Function(Opaque)`。 ```python -# with pl.incore(): y = pl.add(x, x) -in_core = ir.InCoreScopeStmt(name_hint="", body=body, span=span) - -# with pl.auto_incore(): (split 可选) -auto = ir.AutoInCoreScopeStmt(name_hint="", body=body, span=span) - # with pl.cluster(): cluster = ir.ClusterScopeStmt(name_hint="", body=body, span=span) @@ -245,6 +243,12 @@ cluster = ir.ClusterScopeStmt(name_hint="", body=body, span=span) hier = ir.HierarchyScopeStmt(level=ir.Level.HOST, role=ir.Role.Worker, name_hint="", body=body, span=span) +# with pl.at(level=Level.CORE_GROUP, +# optimizations=[pl.split(pl.SplitMode.UP_DOWN)]): +hier_core = ir.HierarchyScopeStmt(level=ir.Level.CORE_GROUP, + split=ir.SplitMode.UP_DOWN, + name_hint="", body=body, span=span) + # with pl.spmd(core_num=8): spmd = ir.SpmdScopeStmt(core_num=8, sync_start=False, name_hint="", body=body, span=span) @@ -256,20 +260,31 @@ spmd = ir.SpmdScopeStmt(core_num=8, sync_start=False, (执行一次,线性执行)。 - 必填字段在构造时强制校验:`HierarchyScopeStmt.level_` 不可为空; `SpmdScopeStmt` 拒绝 `core_num <= 0`。 -- `InCoreScopeStmt` / `AutoInCoreScopeStmt` 已计划弃用;新代码应优先使用 - `HierarchyScopeStmt` 或其它将保留的子类。 +- `HierarchyScopeStmt.split_` 可选,且仅在 `Level.CORE_GROUP` 下有意义。 + 它会被复制到提取出的 `InCore` 函数 attrs 上,供 `ExpandMixedKernel` 读取。 - Pass 行为: - - `InterchangeChunkLoops` 消费 `AutoInCoreScopeStmt` - - `OutlineIncoreScopes` 将 `InCoreScopeStmt` 提取为 `Function(InCore)` + - `OutlineHierarchyScopes` 将每个非 `CORE_GROUP` 的 `HierarchyScopeStmt` + 提取为一个独立的 `FunctionType::Opaque` 函数,父函数类型保持不变。 + - `OutlineIncoreScopes`(紧随其后执行)将每个 `CORE_GROUP` + `HierarchyScopeStmt` 提取为一个独立的 `FunctionType::InCore` 函数。 + 包含至少一个 `CORE_GROUP` 作用域的父函数由 `Opaque` 升级为 + `Orchestration`。 - `OutlineClusterScopes` 将 `ClusterScopeStmt` 提取为 `Function(Group)`, - 将独立的 
`SpmdScopeStmt` 提取为 `Function(Spmd)` - - `OutlineHierarchyScopes` 提取 `HierarchyScopeStmt` + 将独立的 `SpmdScopeStmt` 提取为 `Function(Spmd)`。 **变换示例:** ```python -# Before: with pl.incore(): y = pl.add(x, x); return y -# After: main_incore_0(x) -> y; main(x): y = main_incore_0(x); return y +# Before: +# def main(x): +# with pl.at(level=pl.Level.CORE_GROUP): +# y = pl.add(x, x) +# return y +# After: +# def main_core_group_0(x) -> y: ... # FunctionType.InCore +# def main(x) -> y: # FunctionType.Orchestration +# y = main_core_group_0(x) +# return y ``` **并行 for 循环 (ForKind):** @@ -411,7 +426,7 @@ add_func = program.get_function("add") # Access by name | **一元运算** | 5 | Abs, Neg, Not, BitNot, Cast | | **调用/访问** | 2 | Call, TupleGetItemExpr | | **操作** | 2 | Op, GlobalVar | -| **语句** | 15 | AssignStmt, IfStmt, ForStmt, WhileStmt, ReturnStmt, InCoreScopeStmt, AutoInCoreScopeStmt, ClusterScopeStmt, HierarchyScopeStmt, SpmdScopeStmt, YieldStmt, EvalStmt, SeqStmts, BreakStmt, ContinueStmt | +| **语句** | 13 | AssignStmt, IfStmt, ForStmt, WhileStmt, ReturnStmt, ClusterScopeStmt, HierarchyScopeStmt, SpmdScopeStmt, YieldStmt, EvalStmt, SeqStmts, BreakStmt, ContinueStmt | | **类型** | 6 | ScalarType, TensorType, TileType, TupleType, PipeType, UnknownType | | **函数** | 2 | Function, Program | diff --git a/docs/zh-cn/dev/language/00-python_syntax.md b/docs/zh-cn/dev/language/00-python_syntax.md index ae3e7d684..5987676b0 100644 --- a/docs/zh-cn/dev/language/00-python_syntax.md +++ b/docs/zh-cn/dev/language/00-python_syntax.md @@ -255,22 +255,17 @@ for i in pl.unroll(12, chunk=4): body_statements ``` -**要点:** `chunk=C` 将循环拆分为外层顺序循环和 `C` 次迭代的内层循环。内层循环保留原始类型 (Sequential/Parallel/Unroll)。`chunk` 不能与 `init_values` 一起使用,且 `chunk=` 循环只能出现在 `with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk]):` 内;在该作用域外,parser 会直接报错。参见 [SplitChunkedLoops Pass](../passes/05-split_chunked_loops.md)。 +**要点:** `chunk=C` 将循环拆分为外层顺序循环和 `C` 次迭代的内层循环。内层循环保留原始类型 (Sequential/Parallel/Unroll)。`chunk` 不能与 
`init_values` 一起使用。 ### 作用域上下文管理器 (Scope Context Managers) -| 形式 | Scope 类型 | 说明 | -| ---- | ---------- | ---- | -| `pl.at(level=pl.Level.CORE_GROUP)` | `InCore` | CORE_GROUP 级固定边界 outline | -| `pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.split(MODE)])` | `InCore` | InCore + 跨核 split 提示 | -| `pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk])` | `AutoInCore` | 编译器驱动的 chunked 循环 split | -| `pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk, pl.split(MODE)])` | `AutoInCore` | AutoInCore + split 提示(条目独立) | -| `pl.at(level=pl.Level.HOST)`(或任意非 `CORE_GROUP` 级别) | `Hierarchy` | 分布式层级作用域 | -| `pl.cluster()` | `Cluster` | AIC+AIV 协同调度组 | -| `pl.incore()` *(已弃用)* | `InCore` | 请改用 `pl.at(level=pl.Level.CORE_GROUP)` | -| `pl.auto_incore(split=...)` *(已弃用)* | `AutoInCore` | 请改用 `pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk, pl.split(...)])` | -| `pl.at(..., optimization=pl.chunked_loop_optimizer[(split=...)])` *(已弃用)* | `AutoInCore` | 请改用 `pl.at(..., optimizations=[pl.auto_chunk, pl.split(...)])` | -| `pl.at(..., split=...)` *(已弃用)* | `InCore` | 请改用 `pl.at(..., optimizations=[pl.split(...)])` | +| 形式 | 产生 | 说明 | +| ---- | ---- | ---- | +| `pl.at(level=pl.Level.CORE_GROUP)` | `HierarchyScopeStmt`(level=CORE_GROUP) | 由 `OutlineIncoreScopes` 提取为 `Function(InCore)`;其父 `Opaque` 函数升级为 `Orchestration` | +| `pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.split(MODE)])` | `HierarchyScopeStmt`(level=CORE_GROUP, split=MODE) | 同上;split 提示随提取函数保留,由 `ExpandMixedKernel` 消费 | +| `pl.at(level=pl.Level.HOST)`(或任意非 `CORE_GROUP` 层级) | `HierarchyScopeStmt`(level=HOST/...) 
| 提取为 `Function(Opaque)`;父函数类型保持不变 | +| `pl.cluster()` | `ClusterScopeStmt` | 由 `OutlineClusterScopes` 提取为 `Function(Group)` | +| `pl.spmd(core_num=N[, sync_start=...])` | `SpmdScopeStmt` | 非 cluster 内的 standalone spmd 提取为 `Function(Spmd)`;若位于 cluster 内,属性会被合并到 Group 函数上 | 示例参见 [语言指南](../../user/01-language_guide.md#incore-作用域)。 diff --git a/docs/zh-cn/dev/passes/00-pass_manager.md b/docs/zh-cn/dev/passes/00-pass_manager.md index 26c2b2c7a..8ebee8d2a 100644 --- a/docs/zh-cn/dev/passes/00-pass_manager.md +++ b/docs/zh-cn/dev/passes/00-pass_manager.md @@ -33,7 +33,7 @@ | `NoNestedCalls` | 无嵌套调用表达式 (Expression) | | `NormalizedStmtStructure` | 语句 (Statement) 结构已规范化 | | `NoRedundantBlocks` | 无单子节点或嵌套的 SeqStmts | -| `SplitIncoreOrch` | InCore 作用域已提取为独立函数 | +| `HierarchyOutlined` | `HierarchyScopeStmt` 区域已提取为函数(非 CORE_GROUP 由 `OutlineHierarchyScopes` 提取为 `Opaque`;`CORE_GROUP` 由 `OutlineIncoreScopes` 提取为 `InCore`);当存在 `CORE_GROUP` 作用域被提取时,父函数由 `Opaque` 升级为 `Orchestration`。由 `OutlineIncoreScopes`(两个 outline Pass 中的后者)产生。 | | `ClusterOutlined` | Cluster 作用域已提取为 Group 函数 | | `HasMemRefs` | 变量上已初始化内存引用 (MemRef) 对象 | | `IncoreTileOps` | InCore 函数使用 tile 操作 | @@ -61,21 +61,20 @@ struct PassProperties { | UnrollLoops | TypeChecked | TypeChecked | — | | CtrlFlowTransform | TypeChecked | TypeChecked, StructuredCtrlFlow | — | | ConvertToSSA | TypeChecked | TypeChecked, SSAForm | NormalizedStmtStructure | -| FlattenCallExpr | SSAForm | SSAForm, NoNestedCalls | NormalizedStmtStructure | -| SplitChunkedLoops | TypeChecked, SSAForm | TypeChecked, SSAForm | — | -| InterchangeChunkLoops | TypeChecked, SSAForm | TypeChecked, SSAForm | — | | NormalizeStmtStructure | TypeChecked | TypeChecked, NormalizedStmtStructure | — | -| OutlineIncoreScopes | TypeChecked, SSAForm | SplitIncoreOrch | — | +| FlattenCallExpr | SSAForm | SSAForm, NoNestedCalls | NormalizedStmtStructure | +| OutlineHierarchyScopes | SSAForm | SSAForm | — | +| OutlineIncoreScopes | SSAForm | SSAForm, HierarchyOutlined | — | | 
OutlineClusterScopes | TypeChecked, SSAForm | ClusterOutlined | — | -| ConvertTensorToTileOps | SplitIncoreOrch | IncoreTileOps | — | +| ConvertTensorToTileOps | HierarchyOutlined | IncoreTileOps | — | | FlattenTileNdTo2D | SSAForm, IncoreTileOps | SSAForm, TileOps2D | — | -| ResolveBackendOpLayouts | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D | NormalizedStmtStructure | -| ExpandMixedKernel | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D | SSAForm, MixedKernelExpanded | — | -| NormalizeReturnOrder | SplitIncoreOrch, IncoreTileOps | — | — | -| InitMemRef | TypeChecked, SSAForm, SplitIncoreOrch, IncoreTileOps, TileOps2D | HasMemRefs | SSAForm | -| MemoryReuse | TypeChecked, SplitIncoreOrch, IncoreTileOps, HasMemRefs, TileOps2D | — | — | -| InsertSync | TypeChecked, SplitIncoreOrch, IncoreTileOps, HasMemRefs, TileOps2D | — | — | -| AllocateMemoryAddr | TypeChecked, SplitIncoreOrch, IncoreTileOps, HasMemRefs, TileOps2D | AllocatedMemoryAddr | — | +| ResolveBackendOpLayouts | SSAForm, IncoreTileOps, HierarchyOutlined, TileOps2D | SSAForm, IncoreTileOps, HierarchyOutlined, TileOps2D | NormalizedStmtStructure | +| ExpandMixedKernel | SSAForm, IncoreTileOps, HierarchyOutlined, TileOps2D | SSAForm, MixedKernelExpanded | — | +| NormalizeReturnOrder | HierarchyOutlined, IncoreTileOps | — | — | +| InitMemRef | TypeChecked, SSAForm, HierarchyOutlined, IncoreTileOps, TileOps2D | HasMemRefs | SSAForm | +| MemoryReuse | TypeChecked, HierarchyOutlined, IncoreTileOps, HasMemRefs, TileOps2D | — | — | +| InsertSync | TypeChecked, HierarchyOutlined, IncoreTileOps, HasMemRefs, TileOps2D | — | — | +| AllocateMemoryAddr | TypeChecked, HierarchyOutlined, IncoreTileOps, HasMemRefs, TileOps2D | AllocatedMemoryAddr | — | | FuseCreateAssembleToSlice | — | — | — | | Simplify | — | — | — | diff --git a/docs/zh-cn/dev/passes/01-unroll_loops.md b/docs/zh-cn/dev/passes/01-unroll_loops.md index 5dd2217d4..57ddec901 100644 --- 
a/docs/zh-cn/dev/passes/01-unroll_loops.md +++ b/docs/zh-cn/dev/passes/01-unroll_loops.md @@ -77,10 +77,10 @@ class After: UnrollLoops 在 `Default` 和 `DebugTileOptimization` 中都只**运行一次**,位于控制流结构化之前: ```text -UnrollLoops → CtrlFlowTransform → ConvertToSSA → FlattenCallExpr → SplitChunkedLoops → InterchangeChunkLoops → OutlineIncoreScopes → ... +UnrollLoops → CtrlFlowTransform → ConvertToSSA → NormalizeStmtStructure → FlattenCallExpr → OutlineHierarchyScopes → OutlineIncoreScopes → OutlineClusterScopes → ... ``` -UnrollLoops 展开非分块的 `pl.unroll()` 循环(跳过分块展开循环,保留 `chunk` 供后续 `SplitChunkedLoops` 处理)。 +UnrollLoops 将 `pl.unroll()` 循环展开为其内联复制。 ## Pass 属性 diff --git a/docs/zh-cn/dev/passes/02-ctrl_flow_transform.md b/docs/zh-cn/dev/passes/02-ctrl_flow_transform.md index 690c97362..194c35af8 100644 --- a/docs/zh-cn/dev/passes/02-ctrl_flow_transform.md +++ b/docs/zh-cn/dev/passes/02-ctrl_flow_transform.md @@ -163,7 +163,7 @@ while i < n and not __break_0: CtrlFlowTransform 在 UnrollLoops 之后、ConvertToSSA 之前运行: ```text -UnrollLoops -> CtrlFlowTransform -> ConvertToSSA -> FlattenCallExpr -> SplitChunkedLoops -> ... +UnrollLoops -> CtrlFlowTransform -> ConvertToSSA -> NormalizeStmtStructure -> FlattenCallExpr -> OutlineHierarchyScopes -> ... 
``` ## Pass 属性 diff --git a/docs/zh-cn/dev/passes/03-convert_to_ssa.md b/docs/zh-cn/dev/passes/03-convert_to_ssa.md index cd1a95c36..84123415d 100644 --- a/docs/zh-cn/dev/passes/03-convert_to_ssa.md +++ b/docs/zh-cn/dev/passes/03-convert_to_ssa.md @@ -13,7 +13,7 @@ **需要**:TypeChecked 属性 (Property)(需在运行本 Pass 之前已建立,可通过属性验证/`VerificationInstrument` 等机制检查)。 -**使用时机**:在任何需要 SSA 形式的优化或分析之前运行此 Pass(如 OutlineIncoreScopes、内存优化 Pass)。 +**使用时机**:在任何需要 SSA 形式的优化或分析之前运行此 Pass(如 OutlineHierarchyScopes、内存优化 Pass)。 ## API diff --git a/docs/zh-cn/dev/passes/05-outline_hierarchy_scopes.md b/docs/zh-cn/dev/passes/05-outline_hierarchy_scopes.md new file mode 100644 index 000000000..f42dbad27 --- /dev/null +++ b/docs/zh-cn/dev/passes/05-outline_hierarchy_scopes.md @@ -0,0 +1,191 @@ +# OutlineHierarchyScopes Pass + +将非 `CORE_GROUP` 的 `HierarchyScopeStmt` 区域提取为独立的 `Opaque` 函数, +并把作用域的 level/role 元信息带到提取出的函数上。 + +## 概述 + +该 Pass 把每个 `level_` 不为 `Level.CORE_GROUP` 的 `HierarchyScopeStmt` +变换为独立的 `Function` 定义,并将原作用域替换为对该函数的 `Call`。提取出 +的函数类型恒为 `FunctionType::Opaque`;父函数的类型保持不变。 + +| 作用域 `level_` | 本 Pass 是否处理 | 提取出的函数类型 | 父函数类型(Pass 后) | +| --------------- | ---------------- | ---------------- | --------------------- | +| `Level.HOST`、`Level.CLUSTER`、`Level.GLOBAL`、... 
| 是 | `FunctionType::Opaque` | 保持不变 | +| `Level.CORE_GROUP` | **否 —— 有意跳过** | *(由 [`OutlineIncoreScopes`](06-outline_incore_scopes.md) 处理)* | *(由下一个 Pass 提升为 `Orchestration`)* | + +`CORE_GROUP` 作用域在本 Pass 中被有意保留;紧接着执行的 +[`OutlineIncoreScopes`](06-outline_incore_scopes.md) 会把它们提取为 +`Function(InCore)` 并将父函数由 `Opaque` 提升为 `Orchestration`。 + +**前置条件**: + +- 输入 IR 必须为 SSA 形式(需先运行 `ConvertToSSA`)。本 Pass 保留 + (产生)SSA 形式。 +- 处理 `Opaque` 函数。已经为 `Orchestration`、`InCore`、`AIC`、`AIV`、 + `Group` 的函数保持不变。 + +**使用时机**:在 `ConvertToSSA`/`FlattenCallExpr` 之后运行,当 IR 中包含 +非 `CORE_GROUP` 层级的 `with pl.at(level=...):` 作用域需要提取为独立辅助 +函数时使用。 + +## API + +| C++ | Python | 级别 | +| --- | ------ | ---- | +| `pass::OutlineHierarchyScopes()` | `passes.outline_hierarchy_scopes()` | 程序级 | + +**工厂函数**: + +```cpp +Pass OutlineHierarchyScopes(); +``` + +**Python 用法**: + +```python +from pypto.pypto_core import passes + +outline_pass = passes.outline_hierarchy_scopes() +program_outlined = outline_pass(program) +``` + +## 算法 + +1. **扫描 Hierarchy 作用域**:在每个 `Opaque` 函数体中查找所有 `level_` + **不为** `CORE_GROUP` 的 `HierarchyScopeStmt` 节点。 +2. **分析输入/输出**:复用 scope_outline_utils 辅助工具计算外部定义、内部 + 使用的变量(输入)以及内部定义、外部使用的变量(输出)。 +3. **创建提取函数**:将作用域体提取为新的 `Function`: + - 参数 = 输入变量 + - 返回值 = 输出变量 + - 函数体 = 作用域体 + - `func_type_` = `Opaque` + - 将 `role_` 元信息复制到函数 attrs。 +4. **替换作用域**:将原 `HierarchyScopeStmt` 替换为对提取函数的 `Call` + + 绑定返回值的若干 `AssignStmt`。 +5. **保持父函数类型**:本 Pass 不修改父函数的 `func_type_`。对 + `CORE_GROUP` 作用域的父函数提升由 + [`OutlineIncoreScopes`](06-outline_incore_scopes.md) 负责。 +6. 
**加入程序**:将提取出的函数前置到程序的函数列表中。 + +**命名规则**:`{原函数名}_{level}_{计数器}`(例如 `main_host_0`、 +`main_global_0`)。若 `HierarchyScopeStmt.name_hint` 非空,则直接使用该 +name_hint。 + +## 示例 + +### 非 CORE_GROUP 层级(HOST) + +**之前**: + +```python +@pl.program +class Before: + @pl.function # Opaque + def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + with pl.at(level=pl.Level.HOST): + y = helper(x) + return y +``` + +**之后**(父函数仍为 `Opaque`,提取函数也是 `Opaque`): + +```python +@pl.program +class After: + @pl.function # 未变 + def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + y = self.main_host_0(x) + return y + + @pl.function # Opaque + def main_host_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + y = helper(x) + return y +``` + +### 多输出 + +```python +with pl.at(level=pl.Level.HOST): + a_tile = pl.load(a, [0], [64]) + b_tile = pl.load(b, [0], [64]) + c_tile = pl.add(a_tile, b_tile) + out_a = pl.store(c_tile, [0], out) + out_b = pl.mul(c_tile, 2.0) +# out_a 与 out_b 都在作用域之后被使用 +x = out_a + out_b +``` + +提取后的函数体变为: + +```python +out_a, out_b = self.main_host_0(a, b, out) # 多返回值 +x = out_a + out_b +``` + +### CORE_GROUP 作用域会被跳过 + +```python +@pl.function # Opaque +def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + with pl.at(level=pl.Level.CORE_GROUP): # <-- 本 Pass 不处理 + tile = pl.load(x, [0], [64]) + result = pl.store(tile, [0], x) + return result +``` + +本 Pass 会把上述 `CORE_GROUP` 作用域原样保留。下一个流水线 Pass +[`OutlineIncoreScopes`](06-outline_incore_scopes.md) 会把它提取为 +`Function(InCore)` 并把父函数提升为 `Orchestration`。 + +## 实现 + +**头文件**:`include/pypto/ir/transforms/passes.h` + +```cpp +Pass OutlineHierarchyScopes(); +``` + +**实现文件**:`src/ir/transforms/outline_hierarchy_scopes.cpp` + +- 使用公共 `scope_outline_utils` 计算输入/输出 +- 对每个非 `CORE_GROUP` 作用域构造新的 `Function(Opaque)` +- 将 `role_` 元信息复制到提取函数的 attrs +- 从不修改父函数的 `func_type_` + +**Python 绑定**:`python/bindings/modules/passes.cpp` + +```cpp +passes.def("outline_hierarchy_scopes", 
&pass::OutlineHierarchyScopes, + "Outline non-CORE_GROUP HierarchyScopeStmt regions into Opaque functions"); +``` + +**测试**:`tests/ut/ir/transforms/test_outline_hierarchy_scopes.py` + +- 测试非 `CORE_GROUP` 作用域 → `Opaque` 函数 + 父函数不变 +- 测试 `CORE_GROUP` 作用域保持原样不被处理 +- 测试输入/输出分析 +- 测试同一父函数中多个非 `CORE_GROUP` 作用域 +- 测试 SSA 保留 + +## Pass 属性 + +| 属性 | 值 | +| ---- | -- | +| 所需 | `SSAForm` | +| 产生 | `SSAForm` | +| 失效 | — | + +`HierarchyOutlined` 现由紧随其后的 +[`OutlineIncoreScopes`](06-outline_incore_scopes.md) Pass 产生,它负责 +处理剩余的 `CORE_GROUP` 作用域。 + +## 流水线位置 + +```text +... → ConvertToSSA → NormalizeStmtStructure → FlattenCallExpr → +OutlineHierarchyScopes → OutlineIncoreScopes → OutlineClusterScopes → +ConvertTensorToTileOps → ... +``` diff --git a/docs/zh-cn/dev/passes/05-split_chunked_loops.md b/docs/zh-cn/dev/passes/05-split_chunked_loops.md deleted file mode 100644 index 0dfd8b4ff..000000000 --- a/docs/zh-cn/dev/passes/05-split_chunked_loops.md +++ /dev/null @@ -1,187 +0,0 @@ -# SplitChunkedLoops Pass - -将带有 `chunk` 的循环按两种策略之一拆分为嵌套的外层/内层循环。 - -## 概述 - -此 Pass 将使用 `chunk=C` 创建的 for 循环转换为嵌套循环:外层循环遍历分块索引,内层循环在每个分块内迭代。支持两种生成策略: - -- **`guarded`**(默认)— 发射一个长度为 `ceil(T/C)` 的外层循环和一个长度为 `C` 的内层循环,并用 `if (idx < stop)`(负步长时为 `idx > stop`)包裹循环体。越界迭代变为空操作。只发射一个 kernel。 -- **`leading_full`** — 发射一个长度为 `T/C` 的满块循环加一个长度为 `T % C` 的独立余数循环。发射两个并列循环。 - -两种策略都在 SSA 转换之后运行,并将 `iter_args` 传播到生成的循环中。 - -**前置条件**: `TypeChecked`、`SSAForm`。 - -**使用时机**: 在默认流水线中自动运行,位于 `FlattenCallExpr` 之后、`InterchangeChunkLoops` 之前。在 `with pl.auto_incore():` 作用域内的 `pl.range()`、`pl.parallel()`、`pl.unroll()` 上使用 `chunk=`。`auto_incore` 之外的分块循环不会被拆分。 - -## API - -| C++ | Python | 级别 | -| --- | ------ | ---- | -| `pass::SplitChunkedLoops()` | `passes.split_chunked_loops()` | 函数级 | - -```python -from pypto import passes -result = passes.split_chunked_loops()(program) -``` - -## DSL 语法 - -分块循环必须包裹在 `with pl.auto_incore():` 中: - -```python -with pl.auto_incore(): - # 默认 (guarded):单 kernel + if-guard - for i in pl.range(10, 
chunk=5): - x = pl.add(x, 1.0) - - # 显式 guarded(与默认等价) - for i in pl.parallel(n, chunk=4, chunk_policy="guarded"): - x = pl.add(x, 1.0) - - # 显式 leading_full:余数剥离为独立循环 - for i in pl.range(7, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - - # 两种策略都支持 iter_args - for i, (s,) in pl.range(10, init_values=(x,), chunk=5): - s = pl.add(s, 1.0) - s = pl.yield_(s) -``` - -## 策略选择 - -| 场景 | 偏好 `guarded` | 偏好 `leading_full` | -| ---- | -------------- | ------------------- | -| 动态 bound(`stop` 非编译期常量) | ✅ —— 单 kernel 保留跨边界的 loop-carried 状态 | ❌ —— 余数 kernel 的 iter_args 只能以 input-only 拷贝方式传入,破坏跨迭代累积 | -| 静态 bound 且可整除 | guard 稍显冗余 | ✅ —— 无 guard、无余数 | -| 希望 `pl.auto_incore()` 下 kernel 数量最少 | ✅ | 每个分块循环会生成 2 个 kernel | -| 希望热点循环内部不存在掩码迭代 | ❌ | ✅ —— 满块无条件执行 | - -`guarded` 被设为默认,原因在于:(1) 动态 bound 下能保留 `add_inout()` 累积;(2) 避免 `pl.auto_incore()` 下 kernel 数量翻倍。 - -## 约束 - -| 约束 | 原因 | -| ---- | ---- | -| `step`、`chunk` 必须为整数常量 | 编译期需要确定值 | -| `chunk` 必须为正整数 | 非正数的分块大小无效 | -| `step` 可以为负(下降循环) | `guarded` 会根据步长符号选择判据 | -| `start`、`stop` 在 `guarded` 下可以是动态表达式 | 迭代次数取 `max(abs(stop - start), 0) / abs(step)` | -| 分块循环必须在 `pl.auto_incore()` 内 | 仅 `auto_incore` 作用域内的循环会被拆分 | -| `chunk` 可以与 `init_values` 同时使用 | 两种策略都会将 iter_args 串联到生成的循环 | - -## 算法 - -记 `T = ceil(max(|stop - start|, 0) / |step|)`,`C = chunk`。 - -### `guarded`(默认) - -1. `n_total = ceil(T / C)`。静态 bound 直接计算,动态 bound 用 `(T + C - 1) // C`。 -2. 发射外层循环 `for out_var in [0, n_total)` 与内层循环 `for in_var in [0, C)`。 -3. 计算 `idx = start + (out_var * C + in_var) * step`,并替换到循环体里。 -4. 将访问后的循环体包裹进 `IfStmt`,条件为: - - `idx < stop`(当 `step > 0`) - - `idx > stop`(当 `step < 0`) -5. **无 iter_args** —— IfStmt 无 else 分支;被跳过的迭代为空操作。 -6. **有 iter_args** —— IfStmt 的 `return_vars` 作为 phi:then 分支保留用户循环体的末尾 `YieldStmt`(更新后的值),else 分支 yield 未变的 inner iter_args。内层循环的末尾 `YieldStmt` 引用 IfStmt 的 phi 变量,从而在生效与被跳过的迭代之间都能串联循环携带状态。 - -### `leading_full` - -1. `n_full = T // C`,`n_rem = T % C`。 -2. 
发射外层 `for out_var in [0, n_full)` 与内层 `for in_var in [0, C)`,`idx = start + (out_var * C + in_var) * step`;若 `n_full == 0` 则跳过。 -3. 若 `n_rem > 0`,发射余数循环 `for rem_var in [0, n_rem)`,`idx = start + (n_full * C + rem_var) * step`。其 `init_values` 链接自外层循环的 `return_vars`(如果没有满块循环,则链接自原始 init 值)。 -4. 将原始 `return_vars` 重映射到最终循环的 `return_vars`。 - -两种路径都在内层与外层/余数循环上保留原始的 `ForKind`(Sequential、Parallel、Unroll)。 - -## 自动命名缩写 - -打印出来的 IR 使用紧凑的自动命名格式 `base__qualifier_role_vN`。缩写 qualifier: - -| 缩写 | 含义 | 发射时机 | -| ---- | ---- | -------- | -| `co` | chunk_outer | 两种策略 | -| `ci` | chunk_inner | 两种策略 | -| `cr` | chunk_rem(余数) | 仅 `leading_full` | -| `cg` | chunk_guard(IfStmt phi) | 仅带 iter_args 的 `guarded` | - -示例:`i__co_idx_v0`(外层索引)、`x__ci_iter_v1`(内层 iter_arg)、`x__cr_rv_v1`(余数 return var)、`x__cg_rv_v1`(IfStmt phi 变量)。 - -## 示例 - -### `guarded`,可整除(`chunk=5`,trip_count=10) - -**之后**: - -```python -for i__co_idx_v0, (x__co_iter_v1,) in pl.range(2, init_values=(x__ssa_v0,)): - for i__ci_idx_v0, (x__ci_iter_v1,) in pl.range(5, init_values=(x__co_iter_v1,)): - if i__co_idx_v0 * 5 + i__ci_idx_v0 < 10: - x__ssa_v3 = pl.tensor.add(x__ci_iter_v1, 1.0) - x__cg_rv_v1 = pl.yield_(x__ssa_v3) - else: - x__cg_rv_v1 = pl.yield_(x__ci_iter_v1) - x__ci_rv_v1 = pl.yield_(x__cg_rv_v1) - x__co_rv_v1 = pl.yield_(x__ci_rv_v1) -return x__co_rv_v1 -``` - -### `guarded`,动态 bound(`chunk=4`,`stop=n`) - -**之后**(单 kernel,`n_total = (n + 3) // 4`): - -```python -for i__co_idx_v0, (x__co_iter_v1,) in pl.range((n + 3) // 4, init_values=(x__ssa_v0,)): - for i__ci_idx_v0, (x__ci_iter_v1,) in pl.range(4, init_values=(x__co_iter_v1,)): - if i__co_idx_v0 * 4 + i__ci_idx_v0 < n: - x__ssa_v3 = pl.tensor.add(x__ci_iter_v1, 1.0) - x__cg_rv_v1 = pl.yield_(x__ssa_v3) - else: - x__cg_rv_v1 = pl.yield_(x__ci_iter_v1) - x__ci_rv_v1 = pl.yield_(x__cg_rv_v1) - x__co_rv_v1 = pl.yield_(x__ci_rv_v1) -return x__co_rv_v1 -``` - -### `leading_full`,不可整除(`chunk=5`,trip_count=7) - -**之后**(两个并列循环): - -```python -for i__co_idx_v0, 
(x__co_iter_v1,) in pl.range(1, init_values=(x__ssa_v0,)): - for i__ci_idx_v0, (x__ci_iter_v1,) in pl.range(5, init_values=(x__co_iter_v1,)): - x__ssa_v3 = pl.tensor.add(x__ci_iter_v1, 1.0) - x__ci_rv_v1 = pl.yield_(x__ssa_v3) - x__co_rv_v1 = pl.yield_(x__ci_rv_v1) -for i__cr_idx_v0, (x__cr_iter_v1,) in pl.range(2, init_values=(x__co_rv_v1,)): - x__ssa_v4 = pl.tensor.add(x__cr_iter_v1, 1.0) - x__cr_rv_v1 = pl.yield_(x__ssa_v4) -return x__cr_rv_v1 -``` - -## LoopOrigin 标记 - -| LoopOrigin | 说明 | 发射时机 | -| ---------- | ---- | -------- | -| `Original` | 普通用户循环(默认) | — | -| `ChunkOuter` | 遍历分块索引的外层循环 | 两种策略 | -| `ChunkInner` | 在分块内迭代的内层循环 | 两种策略 | -| `ChunkRemainder` | 处理剩余迭代的余数循环 | 仅 `leading_full` | - -通过 `for_stmt.attrs.get("loop_origin")`(Python)或 `for_stmt->GetAttr("loop_origin")`(C++)访问。 - -## 流水线位置 - -```text -UnrollLoops → ConvertToSSA → FlattenCallExpr → SplitChunkedLoops → InterchangeChunkLoops → OutlineIncoreScopes → ... -``` - -## Pass 属性 - -| 属性 | 值 | -| ---- | -- | -| Required | `TypeChecked`、`SSAForm` | -| Produced | `TypeChecked`、`SSAForm` | -| Invalidated | (无) | diff --git a/docs/zh-cn/dev/passes/06-interchange_chunk_loops.md b/docs/zh-cn/dev/passes/06-interchange_chunk_loops.md deleted file mode 100644 index a50ebb691..000000000 --- a/docs/zh-cn/dev/passes/06-interchange_chunk_loops.md +++ /dev/null @@ -1,197 +0,0 @@ -# InterchangeChunkLoops Pass - -重新排列嵌套的 ChunkOuter/ChunkInner 循环对并插入 `InCore` 作用域,为下游提取做准备。 - -## 概述 - -在 `SplitChunkedLoops` 将分块循环拆分为嵌套的 `ChunkOuter→ChunkInner` 对之后,嵌套分块循环的结构为: - -```text -i_out[ChunkOuter] → i_in[ChunkInner,Parallel] → j_out[ChunkOuter] → j_in[ChunkInner,Parallel] → body -``` - -此 Pass 重新排列,使所有外层循环在顶部,并将内层循环 + 循环体包裹在 `InCoreScopeStmt` 中: - -```text -i_out[ChunkOuter] → j_out[ChunkOuter] → InCore{ i_in[ChunkInner] → j_in[ChunkInner] → body } -``` - -**前置条件**: TypeChecked、SSAForm 属性。 - -**使用时机**: 在默认流水线中自动运行,位于 `SplitChunkedLoops` 之后、`OutlineIncoreScopes` 之前。仅处理 `pl.auto_incore()` 作用域内的循环。此 Pass 会消费(移除)`AutoInCore` 作用域。 
- -## API - -| C++ | Python | 级别 | -| --- | ------ | ---- | -| `pass::InterchangeChunkLoops()` | `passes.interchange_chunk_loops()` | 函数级 | - -**Python 用法**: - -```python -from pypto import passes - -result = passes.interchange_chunk_loops()(program) -``` - -## 约束 - -| 约束 | 行为 | -| ---- | ---- | -| 仅 SSA | 在 `SplitChunkedLoops` 之后运行(需要 `SSAForm`) | -| 仅并行交换 | 仅当所有 ChunkInner 循环具有 `ForKind::Parallel` 时才交换 | -| 顺序分块循环 | 不交换,但如果在 `auto_incore` 内则包裹在 InCore 中 | -| 已有 InCore | 如果链体已包含 `InCoreScopeStmt`,则跳过 | -| 需要 `auto_incore` 作用域 | 仅处理 `AutoInCoreScopeStmt` 内的循环;该作用域会被消费 | - -## 算法 - -1. **收集链** — 从 `ChunkOuter` ForStmt 开始,遍历嵌套的 ForStmt 体。构建 `(ForStmt, LoopOrigin)` 条目列表。在遇到非 ForStmt、`Original` 循环或 `ScopeStmt` 时停止。 - -2. **守卫检查** — 验证所有 ChunkInner 循环为 Parallel。检查最内层循环体中无已有 InCore 作用域。 - -3. **分离** — 将链分为 `outers`(ChunkOuter)和 `inners`(ChunkInner)。 - -4. **重建**(由内到外构建): - - 访问最内层循环体 - - 将 inners 包裹在循环体外(保持顺序),重新连接 iter_args - - 包裹在 `InCoreScopeStmt` 中 - - 将 outers 包裹在 InCore 外(保持顺序),重新连接 iter_args 和 yields - -5. 
**处理余数** — `ChunkRemainder` 循环:递归进入循环体。将独立的并行余数子循环包裹在 InCore 中。 - -## 自动命名缩写 - -下面示例里的变量名使用了 `base__qualifier_role_vN` 这一紧凑格式,其中 qualifier 有若干缩写: - -| 缩写 | 含义 | -| ---- | ---- | -| `co` | `chunk_outer` | -| `ci` | `chunk_inner` | -| `cr` | `chunk_rem` / 余数分块 | -| `lN` | interchange 之后的第 `N` 层循环 | - -示例: - -- `x__co_iter_v1`:交换前的外层分块 iter_arg -- `x__co_l0_iter_v1`:交换后第 0 层循环上传递的 iter_arg -- `x__co_l2_rv_v1`:从重排后第 2 层循环流出的 return var - -像 `iter`、`rv`、`idx`、`ssa` 这样的 role 不再继续缩写,以便变量用途仍然一眼可见。 - -## 示例 - -**之前**(SplitChunkedLoops 之后,全并行): - -```python -for i__co_idx_v0, (x__co_iter_v1,) in pl.range(2, init_values=(x__ssa_v0,)): # ChunkOuter - for i__ci_idx_v0, (x__ci_iter_v1,) in pl.parallel( - 4, init_values=(x__co_iter_v1,) - ): # ChunkInner - for j__co_idx_v0, (y__co_iter_v1,) in pl.range( - 3, init_values=(x__ci_iter_v1,) - ): # ChunkOuter - for j__ci_idx_v0, (y__ci_iter_v1,) in pl.parallel( - 4, init_values=(y__co_iter_v1,) - ): # ChunkInner - z = pl.add(y__ci_iter_v1, 1.0) - y__ci_rv_v1 = pl.yield_(z) - y__co_rv_v1 = pl.yield_(y__ci_rv_v1) - x__ci_rv_v1 = pl.yield_(y__co_rv_v1) - x__co_rv_v1 = pl.yield_(x__ci_rv_v1) -return x__co_rv_v1 -``` - -**之后**(InterchangeChunkLoops): - -```python -for i__co_idx_v0, (x__co_l0_iter_v1,) in pl.range( - 2, init_values=(x__ssa_v0,) -): # ChunkOuter - for j__co_idx_v0, (x__co_l1_iter_v1,) in pl.range( - 3, init_values=(x__co_l0_iter_v1,) - ): # ChunkOuter - with pl.incore(): # 插入 InCore - for i__ci_idx_v0, (x__co_l2_iter_v1,) in pl.parallel( - 4, init_values=(x__co_l1_iter_v1,) - ): # ChunkInner - for j__ci_idx_v0, (x__co_l3_iter_v1,) in pl.parallel( - 4, init_values=(x__co_l2_iter_v1,) - ): # ChunkInner - z = pl.add(x__co_l3_iter_v1, 1.0) - x__co_l3_rv_v1 = pl.yield_(z) - x__co_l2_rv_v1 = pl.yield_(x__co_l3_rv_v1) - x__co_l1_rv_v1 = pl.yield_(x__co_l2_rv_v1) - x__co_l0_rv_v1 = pl.yield_(x__co_l1_rv_v1) -return x__co_l0_rv_v1 -``` - -## 余数处理 - -对于不整除的迭代次数,余数循环会被包裹在 InCore 中: - -```python -for i_rem, (...) 
in pl.parallel(2, init_values=(...)): # ChunkRemainder - for j_out, (...) in pl.range(3, init_values=(...)): # 已应用交换 - with pl.incore(): - for j_in, (...) in pl.parallel(4, init_values=(...)): - body - with pl.incore(): # 余数已包裹 - for j_rem, (...) in pl.parallel(2, init_values=(...)): - body -``` - -## 非分块语句处理 - -当 `auto_incore` 被消费时,未被分块交换处理的语句(独立张量算子、非分块循环、未通过并行守卫检查的顺序分块循环)会被包裹在 `InCoreScopeStmt` 中,以确保它们被 `OutlineIncoreScopes` 提取到 InCore 函数中。 - -连续的非 InCore 语句会被分组到单个 `InCoreScopeStmt` 中。控制流语句(`YieldStmt`、`ReturnStmt`)和纯标量赋值(例如索引运算 `offset = ob * 32`)不会被包裹——它们留在编排作用域中。 - -**示例** — 独立算子 + 并行分块: - -```python -# 之前(在 auto_incore 内部,SplitChunkedLoops 之后) -with pl.auto_incore(): - x = pl.add(x, 1.0) # 独立算子 - for i_out in pl.range(2): # ChunkOuter(并行内层) - for i_in in pl.parallel(4): - x = pl.add(x, 2.0) - -# InterchangeChunkLoops 之后 -with pl.incore(): # 独立算子已包裹 - x = pl.add(x, 1.0) -for i_out in pl.range(2): # 已交换的分块 - with pl.incore(): - for i_in in pl.parallel(4): - x = pl.add(x, 2.0) -``` - -**示例** — 顺序分块(未通过交换守卫检查): - -```python -# 之前 -with pl.auto_incore(): - for i_out in pl.range(2): # ChunkOuter(顺序内层) - for i_in in pl.range(4): # ChunkInner,Sequential → 未通过守卫 - x = pl.add(x, 1.0) - -# 之后 — 整个链被包裹在 InCore 中 -with pl.incore(): - for i_out in pl.range(2): - for i_in in pl.range(4): - x = pl.add(x, 1.0) -``` - -## 流水线位置 - -```text -UnrollLoops → ConvertToSSA → FlattenCallExpr → SplitChunkedLoops → InterchangeChunkLoops → OutlineIncoreScopes → ... 
-``` - -## Pass 属性 - -| 属性 | 值 | -| ---- | -- | -| Required | `TypeChecked`、`SSAForm` | -| Produced | `TypeChecked`、`SSAForm` | -| Invalidated | (无) | diff --git a/docs/zh-cn/dev/passes/06-outline_incore_scopes.md b/docs/zh-cn/dev/passes/06-outline_incore_scopes.md new file mode 100644 index 000000000..b0164195e --- /dev/null +++ b/docs/zh-cn/dev/passes/06-outline_incore_scopes.md @@ -0,0 +1,217 @@ +# OutlineIncoreScopes Pass + +将 `level_ == CORE_GROUP` 的 `HierarchyScopeStmt` 区域提取为独立的 +`Function(InCore)` 定义,并把外层父函数由 `Opaque` 提升为 `Orchestration`。 + +## 概述 + +该 Pass 专门处理 `HierarchyScopeStmt` 的 `CORE_GROUP` 形式 —— 即由 +`with pl.at(level=pl.Level.CORE_GROUP):` 引入的 per-core-group 内核区域。 +对每个此类作用域,它都会提取出一个新的 `Function`,`func_type_` 为 +`FunctionType::InCore`,并将原作用域替换为对该函数的 `Call`。只要从某个 +父函数中提取出至少一个 `CORE_GROUP` 作用域,就把该父函数的 `func_type_` +由 `Opaque` 提升为 `Orchestration`。 + +本 Pass 是 [`OutlineHierarchyScopes`](05-outline_hierarchy_scopes.md) 在 +`CORE_GROUP` 方向的对应 Pass,后者只处理非 `CORE_GROUP` 层级,生成 +`Function(Opaque)` 且不修改父函数类型。 + +| 作用域 `level_` | 提取出的函数类型 | 父函数类型(Pass 后) | +| --------------- | ---------------- | --------------------- | +| `Level.CORE_GROUP` | `FunctionType::InCore` | `Opaque` 提升为 `Orchestration` | +| 其他层级 | *(本 Pass 不处理;已由 `OutlineHierarchyScopes` 提取)* | — | + +当 `CORE_GROUP` 作用域携带 `split_` 优化提示时,会把该提示复制到提取出的 +`InCore` 函数 attrs 中,供下游 Pass(特别是 +[`ExpandMixedKernel`](11-expand_mixed_kernel.md))在决定如何拆分 AIC / +AIV 核时使用。 + +**前置条件**: + +- 输入 IR 必须为 SSA 形式(需先运行 `ConvertToSSA`)。本 Pass 保留 + (产生)SSA 形式。 +- 期望 `OutlineHierarchyScopes` 已经运行过,因此当前只剩下 `CORE_GROUP` + 的 `HierarchyScopeStmt` 节点需要处理。 +- 仅处理 `Opaque` 函数(其中可能残留 `CORE_GROUP` 作用域)。已经为 + `Orchestration`、`InCore`、`AIC`、`AIV`、`Group` 的函数保持不变。 + +**使用时机**:在 [`OutlineHierarchyScopes`](05-outline_hierarchy_scopes.md) +之后、[`OutlineClusterScopes`](07-outline_cluster_scopes.md) 之前运行。 +本 Pass 完成后,`HierarchyOutlined` 属性成立:`Opaque` / `Orchestration` +函数中不再残留任何 `HierarchyScopeStmt` 节点。 + +## API + +| C++ | Python | 级别 | +| --- | 
------ | ---- | +| `pass::OutlineIncoreScopes()` | `passes.outline_incore_scopes()` | 程序级 | + +**工厂函数**: + +```cpp +Pass OutlineIncoreScopes(); +``` + +**Python 用法**: + +```python +from pypto.pypto_core import passes + +outline_pass = passes.outline_incore_scopes() +program_outlined = outline_pass(program) +``` + +## 算法 + +1. **扫描 CORE_GROUP 作用域**:在每个 `Opaque` 函数体中查找所有 `level_` + 为 `CORE_GROUP` 的 `HierarchyScopeStmt` 节点。 +2. **分析输入/输出**:复用 scope_outline_utils 辅助工具计算外部定义、内部 + 使用的变量(输入)以及内部定义、外部使用的变量(输出)。 +3. **创建 InCore 函数**:将作用域体提取为新的 `Function`: + - 参数 = 输入变量 + - 返回值 = 输出变量 + - 函数体 = 作用域体 + - `func_type_` = `InCore` + - 将 `role_` 复制到函数 attrs + - 若作用域携带 `split_` 优化提示,将其复制到函数的 `split` attr + (由 `ExpandMixedKernel` 消费) +4. **替换作用域**:将原 `HierarchyScopeStmt` 替换为对提取出 InCore 函数的 + `Call` + 绑定返回值的若干 `AssignStmt`。 +5. **父函数提升**:若父函数中至少有一个 `CORE_GROUP` 作用域被提取,则将 + 该父函数由 `Opaque` 重标记为 `Orchestration`。 +6. **加入程序**:将提取出的 InCore 函数前置到程序的函数列表中。 + +**命名规则**:`{原函数名}_core_group_{计数器}`(例如 +`main_core_group_0`)。提取出的 InCore 函数在 attrs 中使用 `_incore_` +风格的名称后缀,在打印的 IR 中便于识别。若 +`HierarchyScopeStmt.name_hint` 非空,则直接使用该 name_hint。 + +## 示例 + +### CORE_GROUP → InCore + Orchestration + +**之前**(假设 `OutlineHierarchyScopes` 已完成,非 CORE_GROUP 作用域已经 +被提取;CORE_GROUP 作用域仍内联在 `main` 中): + +```python +@pl.program +class Before: + @pl.function # Opaque + def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + y = x + 1 + + with pl.at(level=pl.Level.CORE_GROUP): + tile = pl.load(y, [0], [64]) + tile_sq = pl.mul(tile, tile) + result_tile = tile_sq + 1 + result = pl.store(result_tile, [0], x) + + z = result + 2 + return z +``` + +**之后**: + +```python +@pl.program +class After: + @pl.function(type=pl.FunctionType.Orchestration) # 已升级 + def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + y = x + 1 + result = self.main_core_group_0(y, x) # 调用提取出的 InCore 函数 + z = result + 2 + return z + + @pl.function(type=pl.FunctionType.InCore) # 提取出 + def main_core_group_0(self, y: 
pl.Tensor[[64], pl.FP32], + x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + tile = pl.load(y, [0], [64]) + tile_sq = pl.mul(tile, tile) + result_tile = tile_sq + 1 + result = pl.store(result_tile, [0], x) + return result +``` + +### 带 split 提示的 CORE_GROUP + +```python +with pl.at(level=pl.Level.CORE_GROUP, + optimizations=[pl.split(pl.SplitMode.UP_DOWN)]): + ... +``` + +提取出的 `InCore` 函数 attrs 中会携带该 `split` 提示,供后续 +`ExpandMixedKernel` 读取以决定 AIC+AIV 拆分方式。 + +### 多输出 + +```python +with pl.at(level=pl.Level.CORE_GROUP): + a_tile = pl.load(a, [0], [64]) + b_tile = pl.load(b, [0], [64]) + c_tile = pl.add(a_tile, b_tile) + out_a = pl.store(c_tile, [0], out) + out_b = pl.mul(c_tile, 2.0) +# out_a 与 out_b 都在作用域之后被使用 +x = out_a + out_b +``` + +提取后,父函数体变为: + +```python +out_a, out_b = self.main_core_group_0(a, b, out) # 多返回值 +x = out_a + out_b +``` + +## 实现 + +**头文件**:`include/pypto/ir/transforms/passes.h` + +```cpp +Pass OutlineIncoreScopes(); +``` + +**实现文件**:`src/ir/transforms/outline_incore_scopes_pass.cpp` + +- 使用公共 `scope_outline_utils` 计算输入/输出 +- 对每个 `CORE_GROUP` 作用域构造新的 `Function(InCore)` +- 将 `role_` / `split_` 元信息复制到提取函数的 attrs +- 当从某父函数中至少提取出一个 `CORE_GROUP` 作用域时,将该父函数 + 由 `Opaque` 重标记为 `Orchestration` + +**Python 绑定**:`python/bindings/modules/passes.cpp` + +```cpp +passes.def("outline_incore_scopes", &pass::OutlineIncoreScopes, + "Outline CORE_GROUP HierarchyScopeStmt regions into Function(InCore) " + "and promote the parent function to Orchestration"); +``` + +**测试**:`tests/ut/ir/transforms/test_outline_incore_scopes.py` + +- 测试 `CORE_GROUP` 作用域 → `InCore` 函数 + 父函数升级为 `Orchestration` +- 测试 `split_` 透传到提取出的 InCore 函数 +- 测试输入/输出分析 +- 测试同一父函数中多个 `CORE_GROUP` 作用域 +- 测试 SSA 保留 + +## Pass 属性 + +| 属性 | 值 | +| ---- | -- | +| 所需 | `SSAForm` | +| 产生 | `SSAForm`, `HierarchyOutlined` | +| 失效 | — | + +`HierarchyOutlined` 由本 Pass 产生(而非 +[`OutlineHierarchyScopes`](05-outline_hierarchy_scopes.md)):两次 outline +Pass 全部结束后,`Opaque`/`Orchestration` 函数中不再残留任何 
+`HierarchyScopeStmt` 节点。 + +## 流水线位置 + +```text +... → ConvertToSSA → NormalizeStmtStructure → FlattenCallExpr → +OutlineHierarchyScopes → OutlineIncoreScopes → OutlineClusterScopes → +ConvertTensorToTileOps → ... +``` diff --git a/docs/zh-cn/dev/passes/08-outline_cluster_scopes.md b/docs/zh-cn/dev/passes/07-outline_cluster_scopes.md similarity index 72% rename from docs/zh-cn/dev/passes/08-outline_cluster_scopes.md rename to docs/zh-cn/dev/passes/07-outline_cluster_scopes.md index f295828b5..0c517757e 100644 --- a/docs/zh-cn/dev/passes/08-outline_cluster_scopes.md +++ b/docs/zh-cn/dev/passes/07-outline_cluster_scopes.md @@ -11,7 +11,7 @@ - 输入 IR 必须为静态单赋值 (SSA) 形式(需先运行 ConvertToSSA) - 仅处理 Opaque 和 Orchestration 函数 -**使用时机**:在 `OutlineIncoreScopes` 之后运行,当 IR 包含需要提取的 `with pl.cluster():` 作用域或 standalone `with pl.spmd(...):` 作用域时使用。 +**使用时机**:在 `OutlineHierarchyScopes` 和 `OutlineIncoreScopes` 之后运行,当 IR 包含需要提取的 `with pl.cluster():` 作用域或 standalone `with pl.spmd(...):` 作用域时使用。Cluster 体内可能仍包含由 `OutlineIncoreScopes` 先前生成的 `Function(InCore)` 调用。 ## API @@ -42,7 +42,9 @@ program_outlined = outline_pass(program) ## 示例 -**之前**: +**之前**(假设 `OutlineIncoreScopes` 已经把内层的 +`with pl.at(level=pl.Level.CORE_GROUP): ...` 作用域提取为 `Function(InCore)` +`main_core_group_0`): ```python @pl.program @@ -50,8 +52,7 @@ class Before: @pl.function def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: with pl.cluster(): - with pl.incore(): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) + y: pl.Tensor[[64], pl.FP32] = self.main_core_group_0(x) return y ``` @@ -62,8 +63,7 @@ class Before: class After: @pl.function(type=pl.FunctionType.Group) def main_cluster_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.incore(): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) + y: pl.Tensor[[64], pl.FP32] = self.main_core_group_0(x) return y @pl.function @@ -72,7 +72,9 @@ class After: return y ``` -注意:Cluster 内部的 InCore 作用域在提取的 Group 函数中被保留。可以先运行 `OutlineIncoreScopes` 提取 
InCore 作用域再进行聚簇,也可以之后在 Group 函数内提取。 +注意:`OutlineHierarchyScopes` 与 `OutlineIncoreScopes` 均先于本 Pass 运行, +因此 Cluster 体内已经是对 `Function(InCore)` 的调用,而非内联的 +`HierarchyScopeStmt` 节点。提取出的 Group 函数会保留这些调用。 ## Standalone Spmd 示例 @@ -132,12 +134,12 @@ class After: | 产生 | SSAForm, ClusterOutlined | | 失效 | — | -## 与 OutlineIncoreScopes 的关系 +## 与 OutlineHierarchyScopes / OutlineIncoreScopes 的关系 -| 方面 | OutlineIncoreScopes | OutlineClusterScopes | -| ---- | ------------------- | -------------------- | -| 作用域类型 | `ScopeKind::InCore` | `ScopeKind::Cluster` / standalone `ScopeKind::Spmd` | -| 输出函数类型 | `FunctionType::InCore` | `FunctionType::Group` / `FunctionType::Spmd` | -| 命名模式 | `{func}_incore_{n}` | `{func}_cluster_{n}` / `{func}_spmd_{n}` | -| 提升父函数为 | Orchestration | *(不变)* | -| 处理对象 | 仅 Opaque 函数 | Opaque + Orchestration | +| 方面 | OutlineHierarchyScopes | OutlineIncoreScopes | OutlineClusterScopes | +| ---- | ---------------------- | ------------------- | -------------------- | +| 作用域类型 | `HierarchyScopeStmt`(非 CORE_GROUP) | `HierarchyScopeStmt`(CORE_GROUP) | `ClusterScopeStmt` / standalone `SpmdScopeStmt` | +| 输出函数类型 | `FunctionType::Opaque` | `FunctionType::InCore` | `FunctionType::Group` / `FunctionType::Spmd` | +| 命名模式 | `{func}_{level}_{n}` | `{func}_core_group_{n}` | `{func}_cluster_{n}` / `{func}_spmd_{n}` | +| 提升父函数为 | *(不变)* | `Orchestration` | *(不变)* | +| 处理对象 | 仅 `Opaque` 函数 | 仅 `Opaque` 函数 | `Opaque` + `Orchestration` | diff --git a/docs/zh-cn/dev/passes/07-outline_incore_scopes.md b/docs/zh-cn/dev/passes/07-outline_incore_scopes.md deleted file mode 100644 index 90d45bca8..000000000 --- a/docs/zh-cn/dev/passes/07-outline_incore_scopes.md +++ /dev/null @@ -1,173 +0,0 @@ -# OutlineIncoreScopes Pass - -将 InCore 作用域提取为独立函数。 - -## 概述 - -该 Pass 将 `InCoreScopeStmt` 节点变换为独立的 `Function(InCore)` 定义,并将原作用域替换为对提取函数的调用。 - -**前置条件**: - -- 输入 IR 必须为静态单赋值 (SSA) 形式(需先运行 ConvertToSSA);该 Pass 保持(产生)SSAForm -- 仅处理 Opaque 函数(InCore 函数保持不变) - -**使用时机**:在 ConvertToSSA 之后运行,当需要将 InCore 
计算区域提取为独立的可调用函数时使用。 - -## API - -| C++ | Python | 级别 | -| --- | ------ | ---- | -| `pass::OutlineIncoreScopes()` | `passes.outline_incore_scopes()` | 程序级 | - -**工厂函数**: - -```cpp -Pass OutlineIncoreScopes(); -``` - -**Python 用法**: - -```python -from pypto.pypto_core import passes - -outline_pass = passes.outline_incore_scopes() -program_outlined = outline_pass(program) -``` - -## 算法 - -1. **扫描 InCore 作用域**:在 Opaque 函数中查找所有 `InCoreScopeStmt` 节点 -2. **分析输入**:确定外部变量引用(在作用域外定义、在作用域内使用的变量) -3. **分析输出**:确定在作用域之后仍被使用的内部定义(在作用域内定义、在作用域外使用的变量) -4. **创建函数**:将作用域体提取为新的 `Function(scope_type=InCore)`,其中: - - 参数 = 输入变量 - - 返回值 = 输出变量 - - 函数体 = 作用域体 -5. **替换作用域**:将 `InCoreScopeStmt` 替换为: - - 带有输入参数的提取函数调用 - - 每个输出变量对应一个 AssignStmt -6. **添加到程序**:将提取的函数添加到程序的函数列表中 - -**命名规则**: - -- 默认:`{原函数名}_incore_{计数器}`(如 `main_incore_0`、`main_incore_1`) -- 用户自定义:当 `InCoreScopeStmt.name_hint` 非空时,直接使用该名称 - - `with pl.incore(name_hint="fused_add"):` → 函数名为 `fused_add` - -## 示例 - -### 基本提取 - -**之前**: - -```python -@pl.program -class Before: - @pl.function # Opaque function - def main(self, x: Tensor[[64], FP32]) -> Tensor[[64], FP32]: - y = x + 1 - - with pl.incore(): # InCore scope - tile = pl.load(y, [0], [64]) - tile_sq = pl.mul(tile, tile) - result_tile = tile_sq + 1 - result = pl.store(result_tile, [0], x) - - z = result + 2 - return z -``` - -**之后**: - -```python -@pl.program -class After: - @pl.function # Opaque function - def main(self, x: Tensor[[64], FP32]) -> Tensor[[64], FP32]: - y = x + 1 - - # Scope replaced with call + assignments - result = self.main_incore_0(y, x) # Call outlined function - - z = result + 2 - return z - - @pl.function(scope_type=InCore) # Outlined InCore function - def main_incore_0(self, y: Tensor[[64], FP32], x: Tensor[[64], FP32]) -> Tensor[[64], FP32]: - # Scope body moved here - tile = pl.load(y, [0], [64]) - tile_sq = pl.mul(tile, tile) - result_tile = tile_sq + 1 - result = pl.store(result_tile, [0], x) - return result -``` - -### 多输出 - -**之前**: - 
-```python -with pl.incore(): - a_tile = pl.load(a, [0], [64]) - b_tile = pl.load(b, [0], [64]) - c_tile = pl.add(a_tile, b_tile) - out_a = pl.store(c_tile, [0], out) - out_b = pl.mul(c_tile, 2.0) -# Both out_a and out_b used after scope -x = out_a + out_b -``` - -**之后**: - -```python -out_a, out_b = self.main_incore_0(a, b, out) # Multiple outputs -x = out_a + out_b - -# Outlined function: -def main_incore_0(self, a, b, out): - a_tile = pl.load(a, [0], [64]) - b_tile = pl.load(b, [0], [64]) - c_tile = pl.add(a_tile, b_tile) - out_a = pl.store(c_tile, [0], out) - out_b = pl.mul(c_tile, 2.0) - return (out_a, out_b) -``` - -## 实现 - -**头文件**:`include/pypto/ir/transforms/passes.h` - -```cpp -Pass OutlineIncoreScopes(); -``` - -**实现文件**:`src/ir/transforms/outline_incore_scopes.cpp` - -- 使用 SSA 分析确定输入/输出 -- 创建带有 InCore 作用域类型的新 Function 节点 -- 将 InCoreScopeStmt 替换为 Call + AssignStmt -- 管理函数命名和计数器 - -**Python 绑定**:`python/bindings/modules/passes.cpp` - -```cpp -passes.def("outline_incore_scopes", &pass::OutlineIncoreScopes, "Outline InCore scopes"); -``` - -**测试**:`tests/ut/ir/transforms/test_outline_incore_scopes.py` - -- 测试基本作用域提取 -- 测试输入/输出分析 -- 测试同一函数中的多个作用域 -- 测试嵌套作用域 -- 测试 SSA 保持 - -## 前置条件 - -**需要 SSA 形式**:该 Pass 依赖 SSA 属性 (Property): - -- 单赋值确保清晰的输入/输出分析 -- 无变量遮蔽简化了作用域分析 -- 控制流中的 YieldStmt 被正确处理 - -如果 IR 不是 SSA 形式,**请先运行 ConvertToSSA**。 diff --git a/docs/zh-cn/dev/passes/09-convert_tensor_to_tile_ops.md b/docs/zh-cn/dev/passes/08-convert_tensor_to_tile_ops.md similarity index 90% rename from docs/zh-cn/dev/passes/09-convert_tensor_to_tile_ops.md rename to docs/zh-cn/dev/passes/08-convert_tensor_to_tile_ops.md index 22097f490..02c114d8a 100644 --- a/docs/zh-cn/dev/passes/09-convert_tensor_to_tile_ops.md +++ b/docs/zh-cn/dev/passes/08-convert_tensor_to_tile_ops.md @@ -4,14 +4,14 @@ ## 概述 -`OutlineIncoreScopes` 将 InCore 作用域提取为独立函数后,这些函数仍使用 `TensorType` 变量和 `tensor.*` 操作。本 pass 将其降级为直接映射到 PTO-ISA 指令的 `TileType` 变量和 `tile.*` 操作。 +`OutlineHierarchyScopes` 和 
`OutlineIncoreScopes` 将 `HierarchyScopeStmt` 区域提取为独立函数(其中 `OutlineIncoreScopes` 对 `CORE_GROUP` 作用域产生 `Function(InCore)`)后,这些 InCore 函数仍使用 `TensorType` 变量和 `tensor.*` 操作。本 pass 将其降级为直接映射到 PTO-ISA 指令的 `TileType` 变量和 `tile.*` 操作。 本 pass 还会更新编排/不透明函数中的调用点:为 InCore 函数新增的每个输出参数,在调用点插入 `tensor.create`。 **前置条件**: - 输入 IR 必须为 SSA 形式 -- InCore 作用域必须已提取(需先运行 `OutlineIncoreScopes`) +- Hierarchy 作用域必须已提取为独立函数(需先运行 `OutlineHierarchyScopes` 和 `OutlineIncoreScopes`) - 语句结构必须已规范化 **使用时机**:在 `OutlineClusterScopes` 之后、`OptimizeOrchTensors` 之前运行。 @@ -119,7 +119,7 @@ class After: | 属性 | 值 | | ---- | -- | -| Required | SSAForm, SplitIncoreOrch, NormalizedStmtStructure | +| Required | SSAForm, HierarchyOutlined, NormalizedStmtStructure | | Produced | SSAForm, IncoreTileOps, NormalizedStmtStructure | | Invalidated | — | diff --git a/docs/zh-cn/dev/passes/10-optimize_orch_tensors.md b/docs/zh-cn/dev/passes/09-optimize_orch_tensors.md similarity index 98% rename from docs/zh-cn/dev/passes/10-optimize_orch_tensors.md rename to docs/zh-cn/dev/passes/09-optimize_orch_tensors.md index aa0ac9438..e54867d42 100644 --- a/docs/zh-cn/dev/passes/10-optimize_orch_tensors.md +++ b/docs/zh-cn/dev/passes/09-optimize_orch_tensors.md @@ -132,8 +132,8 @@ class After: | 属性 | 值 | | ---- | -- | -| Required | SplitIncoreOrch, IncoreTileOps | -| Produced | SplitIncoreOrch, IncoreTileOps | +| Required | HierarchyOutlined, IncoreTileOps | +| Produced | HierarchyOutlined, IncoreTileOps | | Invalidated | — | ## 关键组件 diff --git a/docs/zh-cn/dev/passes/11-flatten_tile_nd_to_2d.md b/docs/zh-cn/dev/passes/10-flatten_tile_nd_to_2d.md similarity index 100% rename from docs/zh-cn/dev/passes/11-flatten_tile_nd_to_2d.md rename to docs/zh-cn/dev/passes/10-flatten_tile_nd_to_2d.md diff --git a/docs/zh-cn/dev/passes/14-expand_mixed_kernel.md b/docs/zh-cn/dev/passes/11-expand_mixed_kernel.md similarity index 96% rename from docs/zh-cn/dev/passes/14-expand_mixed_kernel.md rename to docs/zh-cn/dev/passes/11-expand_mixed_kernel.md 
index 9e04fd2da..29f3d93c0 100644 --- a/docs/zh-cn/dev/passes/14-expand_mixed_kernel.md +++ b/docs/zh-cn/dev/passes/11-expand_mixed_kernel.md @@ -4,7 +4,7 @@ ## 概述 -在 `OutlineIncoreScopes` 和 `ConvertTensorToTileOps` 之后,InCore 函数可能同时包含 Cube 操作(`tile.matmul`、`tile.gemv` 等)和 Vector 操作(`tile.add`、`tile.exp` 等)。部分操作如 `tile.load`、`tile.store`、`tile.move`、`tile.reshape` 根据其 tile 操作数的 MemorySpace 被分类为 Cube 或 Vector。包含两侧操作的函数是**混合 InCore 函数**。硬件要求 Cube 和 Vector 操作在不同的核心类型上运行,因此该 Pass 将它们拆分为: +在 `OutlineHierarchyScopes`、`OutlineIncoreScopes` 和 `ConvertTensorToTileOps` 之后,InCore 函数可能同时包含 Cube 操作(`tile.matmul`、`tile.gemv` 等)和 Vector 操作(`tile.add`、`tile.exp` 等)。部分操作如 `tile.load`、`tile.store`、`tile.move`、`tile.reshape` 根据其 tile 操作数的 MemorySpace 被分类为 Cube 或 Vector。包含两侧操作的函数是**混合 InCore 函数**。硬件要求 Cube 和 Vector 操作在不同的核心类型上运行,因此该 Pass 将它们拆分为: - **AIC 函数**(`FunctionType::AIC`)— 仅包含 Cube + 共享操作 - **AIV 函数**(`FunctionType::AIV`)— 仅包含 Vector + 共享操作 @@ -76,7 +76,7 @@ Ascend910B(a2a3)——跨核传输经过 GM → Mat,Mat 仅支持 NZ 布 **前置条件**: - 输入 IR 必须具有 tile 操作(需先运行 `ConvertTensorToTileOps`) -- 输入 IR 必须已提取 InCore 作用域(需先运行 `OutlineIncoreScopes`) +- 输入 IR 必须已提取 Hierarchy 作用域为独立函数(需先运行 `OutlineHierarchyScopes` 和 `OutlineIncoreScopes`) - Tile 操作必须已展平为 2D(需先运行 `FlattenTileNdTo2D`) - Tile 内存空间必须已推断(需先运行 `InferTileMemorySpace`) - 跨核 Fractal TileView 分配在 Ascend950 和 Ascend910B 后端均受支持 @@ -289,7 +289,7 @@ class After: | 属性 | 值 | | ---- | -- | -| 所需 | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D, TileMemoryInferred | +| 所需 | SSAForm, IncoreTileOps, HierarchyOutlined, TileOps2D, TileMemoryInferred | | 产生 | SSAForm, MixedKernelExpanded | | 失效 | — | diff --git a/docs/zh-cn/dev/passes/15-init_memref.md b/docs/zh-cn/dev/passes/12-init_memref.md similarity index 98% rename from docs/zh-cn/dev/passes/15-init_memref.md rename to docs/zh-cn/dev/passes/12-init_memref.md index 954def992..db5c04692 100644 --- a/docs/zh-cn/dev/passes/15-init_memref.md +++ b/docs/zh-cn/dev/passes/12-init_memref.md @@ -12,7 +12,7 @@ 内存空间从 `TileType::memory_space_` 读取(由 
InferTileMemorySpace 设置)。无 `memory_space` 的变量默认为 DDR。 -**需要**:SSAForm、SplitIncoreOrch、IncoreTileOps、TileOps2D、TileMemoryInferred。 +**需要**:SSAForm、HierarchyOutlined、IncoreTileOps、TileOps2D、TileMemoryInferred。 **产生**:HasMemRefs、NormalizedStmtStructure。 diff --git a/docs/zh-cn/dev/passes/16-memory_reuse.md b/docs/zh-cn/dev/passes/13-memory_reuse.md similarity index 100% rename from docs/zh-cn/dev/passes/16-memory_reuse.md rename to docs/zh-cn/dev/passes/13-memory_reuse.md diff --git a/docs/zh-cn/dev/passes/17-allocate_memory_addr.md b/docs/zh-cn/dev/passes/14-allocate_memory_addr.md similarity index 100% rename from docs/zh-cn/dev/passes/17-allocate_memory_addr.md rename to docs/zh-cn/dev/passes/14-allocate_memory_addr.md diff --git a/docs/zh-cn/dev/passes/20-partial_unroll_tile_loops.md b/docs/zh-cn/dev/passes/15-partial_unroll_tile_loops.md similarity index 97% rename from docs/zh-cn/dev/passes/20-partial_unroll_tile_loops.md rename to docs/zh-cn/dev/passes/15-partial_unroll_tile_loops.md index bb62dcfe9..8ed13984d 100644 --- a/docs/zh-cn/dev/passes/20-partial_unroll_tile_loops.md +++ b/docs/zh-cn/dev/passes/15-partial_unroll_tile_loops.md @@ -8,7 +8,7 @@ `PartialUnrollTileLoops` 提供更精细的开关:在 tile 层级把循环体复制 `F` 份(典型值 2–4),保留外层 `N/F` 次顺序迭代。每个副本获得独立的定义变量(保持 SSA),各自操作独立的 tile,下游 `MemoryReuse` 无法将其合并。 -**前置条件**: SSAForm、SplitIncoreOrch、IncoreTileOps、TileOps2D、TileMemoryInferred、NormalizedStmtStructure。 +**前置条件**: SSAForm、HierarchyOutlined、IncoreTileOps、TileOps2D、TileMemoryInferred、NormalizedStmtStructure。 **流水线位置**: 位于 `NormalizeReturnOrder` 之后、`InitMemRef` 之前(slot 20.5)。此时 tile 结构决策已完成;同时早于 `InitMemRef`/`MemoryReuse`,使其看到每个副本独立的 tile 变量。 @@ -157,6 +157,6 @@ else: ## 相关 -- [`ReorderUnrolledIO`](21-reorder_unrolled_io.md) —— 消费 `unroll_replicated` 标记 +- [`ReorderUnrolledIO`](16-reorder_unrolled_io.md) —— 消费 `unroll_replicated` 标记 - [`UnrollLoops`](01-unroll_loops.md) —— slot #1 的全展开 Pass,仍是 `pl.unroll(N)` 的主要降级路径 - RFC #1025 —— 设计文档 diff --git 
a/docs/zh-cn/dev/passes/21-reorder_unrolled_io.md b/docs/zh-cn/dev/passes/16-reorder_unrolled_io.md similarity index 95% rename from docs/zh-cn/dev/passes/21-reorder_unrolled_io.md rename to docs/zh-cn/dev/passes/16-reorder_unrolled_io.md index 35347db47..cb9053a47 100644 --- a/docs/zh-cn/dev/passes/21-reorder_unrolled_io.md +++ b/docs/zh-cn/dev/passes/16-reorder_unrolled_io.md @@ -14,7 +14,7 @@ 只要数据流允许,结果即为 `[loads…, compute…, stores…]`。各克隆的输入 tile 在顶部同时活跃,输出 tile 在底部同时活跃 —— `MemoryReuse` 无法合并它们,每个克隆保留独立的 MemRef,从而 ping-pong 缓冲成为可能。 -**前置条件**: SSAForm、SplitIncoreOrch、IncoreTileOps、TileOps2D、TileMemoryInferred、NormalizedStmtStructure。 +**前置条件**: SSAForm、HierarchyOutlined、IncoreTileOps、TileOps2D、TileMemoryInferred、NormalizedStmtStructure。 **流水线位置**: 位于 `PartialUnrollTileLoops` 之后、`InitMemRef` 之前(slot 20.6)。在 `InitMemRef` 之前运行可保留 SSAForm,依赖分析正常工作。 @@ -112,7 +112,7 @@ for i in pl.range(0, 8, 4, attrs={"unroll_replicated": 4}): ## 相关 -- [`PartialUnrollTileLoops`](20-partial_unroll_tile_loops.md) —— 生成本 Pass 消费的 `unroll_replicated` 标记 -- [`MemoryReuse`](16-memory_reuse.md) —— 在本 Pass 之后运行;受益于同时活跃的 tile +- [`PartialUnrollTileLoops`](15-partial_unroll_tile_loops.md) —— 生成本 Pass 消费的 `unroll_replicated` 标记 +- [`MemoryReuse`](13-memory_reuse.md) —— 在本 Pass 之后运行;受益于同时活跃的 tile - RFC #1025 —— 设计文档 - RFC #1026 / PR #1029 —— InOut-use 规约 + 依赖分析工具 diff --git a/docs/zh-cn/dev/passes/99-verifier.md b/docs/zh-cn/dev/passes/99-verifier.md index dfe280cc9..b183b25ae 100644 --- a/docs/zh-cn/dev/passes/99-verifier.md +++ b/docs/zh-cn/dev/passes/99-verifier.md @@ -15,7 +15,7 @@ - **可插拔规则系统**:可通过自定义验证规则进行扩展 - **基于属性的验证**:选择性属性集——精确验证所需内容 -- **结构性属性 (Structural Properties)**:TypeChecked、BreakContinueValid、NoRedundantBlocks、UseAfterDef、OutParamNotShadowed、NoNestedInCore 和 InOutUseValid 在流水线启动时由 `PassPipeline` 验证,并由 `VerificationInstrument` 在每个 Pass 执行前后验证 +- **结构性属性 (Structural Properties)**:TypeChecked、BreakContinueValid、NoRedundantBlocks、UseAfterDef、OutParamNotShadowed 和 InOutUseValid 
在流水线启动时由 `PassPipeline` 验证,并由 `VerificationInstrument` 在每个 Pass 执行前后验证 - **双重验证模式**:收集诊断信息或在首个错误时抛出异常 - **Pass 集成**:可作为优化流水线中的 Pass 使用 - **全面的诊断信息**:收集所有问题及源码位置 @@ -26,10 +26,10 @@ | 类别 | 示例 | 行为 | | ---- | ---- | ---- | -| **结构性** | TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore, InOutUseValid | 始终为真。在流水线启动时验证,并由 `VerificationInstrument` 在每个 Pass 执行前后验证。不在 PassProperties 中声明。 | +| **结构性** | TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, InOutUseValid | 始终为真。在流水线启动时验证,并由 `VerificationInstrument` 在每个 Pass 执行前后验证。不在 PassProperties 中声明。 | | **流水线** | SSAForm, NoNestedCalls, HasMemRefs, ... | 由 Pass 产生/失效。按 Pass 声明的契约验证。 | -`GetStructuralProperties()` 返回 `{TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore, InOutUseValid}`。这些在 `PassPipeline::Run()` 中**于流水线启动时验证**,并由 `VerificationInstrument` **在每个 Pass 执行前后验证**。由于没有 Pass 在 `required`/`produced`/`invalidated` 中声明它们,`VerificationInstrument` 将它们与 Pass 声明的属性合并,确保没有 Pass 破坏这些基本不变量。 +`GetStructuralProperties()` 返回 `{TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, InOutUseValid}`。这些在 `PassPipeline::Run()` 中**于流水线启动时验证**,并由 `VerificationInstrument` **在每个 Pass 执行前后验证**。由于没有 Pass 在 `required`/`produced`/`invalidated` 中声明它们,`VerificationInstrument` 将它们与 Pass 声明的属性合并,确保没有 Pass 破坏这些基本不变量。 ### 验证规则系统 @@ -68,12 +68,11 @@ | **UseAfterDefCheck** | UseAfterDef | 每个 Var 使用均由定义支配(参数、AssignStmt、循环变量、iter_arg、return_var) | | **NormalizedStmtStructure** | NormalizedStmtStructure | 展平嵌套 `SeqStmts` 并解包单子节点 `SeqStmts` | | **NoRedundantBlocks** | NoRedundantBlocks | 无单子节点或嵌套的 `SeqStmts` | -| **SplitIncoreOrch** | SplitIncoreOrch | Opaque 函数中不残留 `InCoreScopeStmt` 节点 | +| **HierarchyOutlined** | HierarchyOutlined | `Opaque`/`Orchestration` 函数中不残留 `HierarchyScopeStmt` 节点 | | **IncoreTileOps** | IncoreTileOps | InCore 函数使用 tile 操作(无张量级操作残留) | | **HasMemRefs** | HasMemRefs 
| 所有 TileType 变量已初始化 MemRef | | **AllocatedMemoryAddr** | AllocatedMemoryAddr | 所有 MemRef 在缓冲区限制内具有有效地址 | | **OutParamNotShadowed** | OutParamNotShadowed | Out/InOut 参数未被张量创建操作重新赋值 | -| **NoNestedInCore** | NoNestedInCore | 无嵌套 InCore 作用域(`InCoreScopeStmt` 内含 `InCoreScopeStmt`) | | **InOutUseValid** | InOutUseValid | 作为 InOut/Out 传入用户函数调用的变量,在调用之后不得再被读取(RFC #1026)。Group 类型函数体目前跳过,待后续完善。 | ### SSAVerify @@ -161,8 +160,8 @@ | 函数 | 返回值 | 描述 | | ---- | ------ | ---- | -| `GetStructuralProperties()` | `{TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore, InOutUseValid}` | 在流水线启动时及每个 Pass 执行前后验证的不变量 | -| `GetDefaultVerifyProperties()` | `{SSAForm, TypeChecked, NoNestedCalls, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore}` | `run_verifier()` 的默认属性集 | +| `GetStructuralProperties()` | `{TypeChecked, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, InOutUseValid}` | 在流水线启动时及每个 Pass 执行前后验证的不变量 | +| `GetDefaultVerifyProperties()` | `{SSAForm, TypeChecked, NoNestedCalls, BreakContinueValid, NoRedundantBlocks, UseAfterDef, OutParamNotShadowed}` | `run_verifier()` 的默认属性集 | | `GetVerifiedProperties()` | `{SSAForm, TypeChecked, AllocatedMemoryAddr, BreakContinueValid, NoRedundantBlocks, InOutUseValid}` | `PassPipeline` 自动验证的轻量级属性集 | ### RunVerifier Pass 工厂 diff --git a/docs/zh-cn/user/01-language_guide.md b/docs/zh-cn/user/01-language_guide.md index e8c93a37b..2de7e516c 100644 --- a/docs/zh-cn/user/01-language_guide.md +++ b/docs/zh-cn/user/01-language_guide.md @@ -410,50 +410,22 @@ class Model: 将代码区域标记为 InCore 执行,无需创建单独的函数: ```python -# 推荐用法(新 API): with pl.at(level=pl.Level.CORE_GROUP): y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - -# 已弃用(请改用 pl.at): -with pl.incore(): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) ``` -如需编译器驱动的 chunked 循环 outline(AutoInCore),在 `optimizations` 列表中传入 -`pl.auto_chunk`: +`OutlineIncoreScopes` 之后会把该区域提取为 `Function(InCore)`,并把父 +`Opaque` 
函数升级为 `Orchestration`。(非 CORE_GROUP 的 `pl.at(level=...)` +区域则由先行的 `OutlineHierarchyScopes` 提取为 `Function(Opaque)`,不会 +提升父函数类型。) -```python -# 推荐用法(新 API): -with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk]): - for i in pl.parallel(0, 8, 1, chunk=4): - x = pl.add(x, x) - -# 已弃用(仍可用,会触发 DeprecationWarning): -with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - ... - -with pl.auto_incore(): - ... -``` - -如需为 `ExpandMixedKernel` Pass 指定跨核 split 模式,使用 `pl.split(...)` —— 它与 -`pl.auto_chunk` 互相独立,可任意组合: +如需为 `ExpandMixedKernel` Pass 指定跨核 split 模式,在 `optimizations` +列表中传入 `pl.split(...)`: ```python -# 普通 InCore + split 提示: with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.split(pl.SplitMode.UP_DOWN)]): y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - -# AutoInCore + split 提示(独立条目,自由组合): -with pl.at(level=pl.Level.CORE_GROUP, - optimizations=[pl.auto_chunk, pl.split(pl.SplitMode.UP_DOWN)]): - for i in pl.parallel(0, 8, 1, chunk=4): - x = pl.add(x, x) - -# 已弃用的单关键字形式(仍可用,会触发 DeprecationWarning): -with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - ... ``` ## 内存与数据搬运 @@ -541,22 +513,28 @@ output_dir = ir.compile( 1. **UnrollLoops** —— 展开循环迭代 2. **CtrlFlowTransform** —— 将控制流改写为结构化 IR 3. **ConvertToSSA** —— 转换为静态单赋值形式 -4. **FlattenCallExpr** —— 展平嵌套函数调用 -5. **SplitChunkedLoops** —— 将分块循环拆分为独立循环 -6. **InterchangeChunkLoops** —— 交换分块循环顺序 -7. **OutlineHierarchyScopes** —— 提取 hierarchy 作用域 -8. **OutlineIncoreScopes** —— 将 InCore 作用域提取为独立函数 -9. **OutlineClusterScopes** —— 提取 cluster 作用域 -10. **ConvertTensorToTileOps** —— 将张量操作转换为 tile 操作 +4. **NormalizeStmtStructure** —— 展平/解包冗余的 `SeqStmts` +5. **FlattenCallExpr** —— 展平嵌套函数调用 +6. **OutlineHierarchyScopes** —— 将非 CORE_GROUP 的 `HierarchyScopeStmt` 区域提取为 `Function(Opaque)` +7. **OutlineIncoreScopes** —— 将 CORE_GROUP 的 `HierarchyScopeStmt` 区域提取为 `Function(InCore)`,并把父函数升级为 `Orchestration` +8. **OutlineClusterScopes** —— 将 cluster 作用域提取为 Group 函数 +9. 
**ConvertTensorToTileOps** —— 将张量操作转换为 tile 操作 +10. **OptimizeOrchTensors** —— 优化编排层张量操作 11. **FlattenTileNdTo2D** —— 将 ND tile 操作规范化为 2D 12. **InferTileMemorySpace** —— 推断 tile 内存空间 13. **ResolveTransposeLayout** —— 修复转置布局处理 14. **ResolveBackendOpLayouts** —— 修复 backend 受限的 tile 布局 15. **ExpandMixedKernel** —— 在需要时拆分 mixed kernel -16. **InitMemRef** —— 分配内存空间并插入缓冲区分配 -17. **MemoryReuse** —— 共享生命周期不重叠的缓冲区 -18. **LegalizePTOBufferReuse** —— 规范化 PTO 缓冲区复用模式 -19. **AllocateMemoryAddr** —— 分配具体内存地址 +16. **SplitVectorKernel** —— 在需要时拆分 vector kernel +17. **NormalizeReturnOrder** —— 按 Out/InOut 参数顺序重排返回值 +18. **PartialUnrollTileLoops** —— 在 tile 层部分展开循环 +19. **ReorderUnrolledIO** —— 将展开副本的 load/store 分组 +20. **InitMemRef** —— 分配内存空间并插入缓冲区分配 +21. **MemoryReuse** —— 共享生命周期不重叠的缓冲区 +22. **LegalizePTOBufferReuse** —— 规范化 PTO 缓冲区复用模式 +23. **AllocateMemoryAddr** —— 分配具体内存地址 +24. **FuseCreateAssembleToSlice** —— 融合 create + assemble 操作 +25. **Simplify** —— 最终简化 Pass ### 调试 diff --git a/docs/zh-cn/user/02-operation_reference.md b/docs/zh-cn/user/02-operation_reference.md index 5466e333d..dc9296348 100644 --- a/docs/zh-cn/user/02-operation_reference.md +++ b/docs/zh-cn/user/02-operation_reference.md @@ -210,6 +210,8 @@ | `yield_` | `(*values: Any) -> Any \| tuple[Any, ...]` | 从 for/if 作用域 yield 值 | | `cond` | `(condition: bool \| Scalar) -> None` | 设置 while 循环条件(必须是第一条语句) | | `const` | `(value: int \| float, dtype: DataType) -> int \| float` | 类型化常量 | -| `incore` | `() -> IncoreContext` | InCore 作用域的上下文管理器 | +| `at` | `(*, level: Level, role: Role \| None = None, optimizations: Sequence[Optimization] \| None = None) -> AtContext` | 层级作用域的上下文管理器;`level=Level.CORE_GROUP` 即为 InCore 形式 | +| `cluster` | `() -> ClusterContext` | cluster(AIC+AIV)作用域的上下文管理器 | +| `spmd` | `(*, core_num: int \| Scalar, sync_start: bool = False) -> SpmdContext` | standalone SPMD 启动作用域的上下文管理器 | | `dynamic` | `(name: str) -> DynVar` | 创建动态维度变量 | | `create_tensor` | `(shape: Sequence[IntLike], dtype: DataType) 
-> Tensor` | 创建张量(从 `pl.tensor` 提升) | diff --git a/include/pypto/ir/core.h b/include/pypto/ir/core.h index a247f3517..8f0f4303e 100644 --- a/include/pypto/ir/core.h +++ b/include/pypto/ir/core.h @@ -89,9 +89,7 @@ enum class ObjectKind { ReturnStmt, ForStmt, WhileStmt, - // Scope statement kinds (split from former ScopeStmt — see issue #1047) - InCoreScopeStmt, - AutoInCoreScopeStmt, + // Scope statement kinds (typed hierarchy — see issue #1047) ClusterScopeStmt, HierarchyScopeStmt, SpmdScopeStmt, diff --git a/include/pypto/ir/kind_traits.h b/include/pypto/ir/kind_traits.h index cfd7adcd1..de1961233 100644 --- a/include/pypto/ir/kind_traits.h +++ b/include/pypto/ir/kind_traits.h @@ -88,8 +88,6 @@ DEFINE_KIND_TRAIT(YieldStmt, ObjectKind::YieldStmt) DEFINE_KIND_TRAIT(ReturnStmt, ObjectKind::ReturnStmt) DEFINE_KIND_TRAIT(ForStmt, ObjectKind::ForStmt) DEFINE_KIND_TRAIT(WhileStmt, ObjectKind::WhileStmt) -DEFINE_KIND_TRAIT(InCoreScopeStmt, ObjectKind::InCoreScopeStmt) -DEFINE_KIND_TRAIT(AutoInCoreScopeStmt, ObjectKind::AutoInCoreScopeStmt) DEFINE_KIND_TRAIT(ClusterScopeStmt, ObjectKind::ClusterScopeStmt) DEFINE_KIND_TRAIT(HierarchyScopeStmt, ObjectKind::HierarchyScopeStmt) DEFINE_KIND_TRAIT(SpmdScopeStmt, ObjectKind::SpmdScopeStmt) @@ -127,19 +125,17 @@ struct KindTrait { static constexpr ObjectKind kinds[] = {ObjectKind::AssignStmt, ObjectKind::IfStmt, ObjectKind::YieldStmt, ObjectKind::ReturnStmt, ObjectKind::ForStmt, ObjectKind::WhileStmt, - ObjectKind::InCoreScopeStmt, ObjectKind::AutoInCoreScopeStmt, ObjectKind::ClusterScopeStmt, ObjectKind::HierarchyScopeStmt, ObjectKind::SpmdScopeStmt, ObjectKind::SeqStmts, ObjectKind::EvalStmt, ObjectKind::BreakStmt, ObjectKind::ContinueStmt}; - static constexpr size_t count = 15; + static constexpr size_t count = 13; }; -// ScopeStmt base class - matches any scope kind (5 derived classes) +// ScopeStmt base class - matches any scope kind (3 derived classes) template <> struct KindTrait { - static constexpr ObjectKind kinds[] = 
{ObjectKind::InCoreScopeStmt, ObjectKind::AutoInCoreScopeStmt, - ObjectKind::ClusterScopeStmt, ObjectKind::HierarchyScopeStmt, + static constexpr ObjectKind kinds[] = {ObjectKind::ClusterScopeStmt, ObjectKind::HierarchyScopeStmt, ObjectKind::SpmdScopeStmt}; static constexpr size_t count = sizeof(kinds) / sizeof(ObjectKind); }; diff --git a/include/pypto/ir/stmt.h b/include/pypto/ir/stmt.h index 107834763..85073e3ee 100644 --- a/include/pypto/ir/stmt.h +++ b/include/pypto/ir/stmt.h @@ -107,13 +107,15 @@ struct ChunkConfig { /** * @brief Loop origin classification for tracking how a loop was generated * - * Used by SplitChunkedLoops to tag each generated loop with its origin. + * The Chunk* values were originally produced by the deleted SplitChunkedLoops + * pass; they remain bound for user-visible attrs but no built-in pass currently + * emits them. */ enum class LoopOrigin : uint8_t { Original = 0, ///< Regular loop (default) - ChunkOuter = 1, ///< Outer loop from chunk splitting - ChunkInner = 2, ///< Inner loop from chunk splitting - ChunkRemainder = 3 ///< Remainder loop from chunk splitting + ChunkOuter = 1, ///< Outer loop from chunk splitting (no producer pass — user attr only) + ChunkInner = 2, ///< Inner loop from chunk splitting (no producer pass — user attr only) + ChunkRemainder = 3 ///< Remainder loop from chunk splitting (no producer pass — user attr only) }; /** @@ -156,11 +158,9 @@ inline LoopOrigin StringToLoopOrigin(const std::string& str) { * @brief Distinguishes different scope kinds */ enum class ScopeKind : uint8_t { - InCore = 0, ///< InCore scope for AICore sub-graphs - AutoInCore = 1, ///< AutoInCore scope for automatic chunking - Cluster = 2, ///< Cluster scope for co-scheduled AIC + AIV groups - Hierarchy = 3, ///< Distributed hierarchy scope (uses level_/role_ on ScopeStmt) - Spmd = 4 ///< SPMD dispatch scope (core_num/sync_start on ScopeStmt) + Cluster = 0, ///< Cluster scope for co-scheduled AIC + AIV groups + Hierarchy = 1, ///< 
Distributed hierarchy scope (uses level_/role_/split_ on ScopeStmt) + Spmd = 2 ///< SPMD dispatch scope (core_num/sync_start on ScopeStmt) }; /** @@ -250,14 +250,10 @@ inline ForKind StringToForKind(const std::string& str) { /** * @brief Convert ScopeKind to string * @param kind The scope kind - * @return String representation ("InCore", "AutoInCore", "Cluster", "Hierarchy", or "Spmd") + * @return String representation ("Cluster", "Hierarchy", or "Spmd") */ inline std::string ScopeKindToString(ScopeKind kind) { switch (kind) { - case ScopeKind::InCore: - return "InCore"; - case ScopeKind::AutoInCore: - return "AutoInCore"; case ScopeKind::Cluster: return "Cluster"; case ScopeKind::Hierarchy: @@ -275,11 +271,7 @@ inline std::string ScopeKindToString(ScopeKind kind) { * @throws pypto::TypeError if string is not recognized */ inline ScopeKind StringToScopeKind(const std::string& str) { - if (str == "InCore") { - return ScopeKind::InCore; - } else if (str == "AutoInCore") { - return ScopeKind::AutoInCore; - } else if (str == "Cluster") { + if (str == "Cluster") { return ScopeKind::Cluster; } else if (str == "Hierarchy") { return ScopeKind::Hierarchy; @@ -702,32 +694,33 @@ using WhileStmtPtr = std::shared_ptr; * Represents a scoped region of code with a specific execution context. * This is NOT a control flow node — it executes its body exactly once, linearly. 
* - * **Class hierarchy** (issue #1047): + * **Class hierarchy:** * - `ScopeStmt` (abstract): common fields `name_hint_`, `body_` - * - `InCoreScopeStmt`: optional `split_` - * - `AutoInCoreScopeStmt`: optional `split_` * - `ClusterScopeStmt`: no extra fields - * - `HierarchyScopeStmt`: required `level_`, optional `role_` + * - `HierarchyScopeStmt`: required `level_`, optional `role_`, optional `split_` + * (split_ is only valid at Level::CORE_GROUP) * - `SpmdScopeStmt`: required `core_num_`, `sync_start_` (default false) * * **Syntax:** - * with pl.incore(): # InCore scope -> InCoreScopeStmt + * with pl.cluster(): # -> ClusterScopeStmt * body - * with pl.cluster(): # Cluster scope -> ClusterScopeStmt + * with pl.at(level=pl.Level.CORE_GROUP): # -> HierarchyScopeStmt * body * with pl.at(level=pl.Level.HOST, role=pl.Role.Worker): # -> HierarchyScopeStmt * body - * with pl.spmd(core_num=8): # -> SpmdScopeStmt + * with pl.spmd(core_num=8): # -> SpmdScopeStmt * body * * **Semantics:** - * - Marks a region of code as belonging to a specific scope (e.g., InCore, Cluster) - * - Executes body exactly once (no iteration, no branching) - * - Variables flow through transparently (no iter_args/return_vars needed) - * - SSA conversion treats it as transparent (just visits body) - * - OutlineIncoreScopes extracts InCore scopes into InCore functions - * - OutlineClusterScopes extracts Cluster scopes into Group functions - * - Hierarchy scopes are outlined into level-/role-annotated functions + * - Marks a region of code as belonging to a specific scope. + * - Executes body exactly once (no iteration, no branching). + * - Variables flow through transparently (no iter_args/return_vars needed). + * - SSA conversion treats it as transparent (just visits body). + * - OutlineHierarchyScopes extracts Hierarchy scopes into level-/role-annotated + * functions. 
For Level::CORE_GROUP, the outlined function has + * FunctionType::InCore and the parent function is promoted to + * FunctionType::Orchestration. + * - OutlineClusterScopes extracts Cluster scopes into Group functions. */ class ScopeStmt : public Stmt { public: @@ -754,60 +747,6 @@ class ScopeStmt : public Stmt { using ScopeStmtPtr = std::shared_ptr; -/** - * @brief InCore scope: AICore sub-graph region. - * - * Carries an optional `split` for cross-core transfer mode. - */ -class InCoreScopeStmt : public ScopeStmt { - public: - InCoreScopeStmt(std::optional split, std::string name_hint, StmtPtr body, Span span, - std::vector leading_comments = {}) - : ScopeStmt(std::move(name_hint), std::move(body), std::move(span), std::move(leading_comments)), - split_(split) {} - - [[nodiscard]] ObjectKind GetKind() const override { return ObjectKind::InCoreScopeStmt; } - [[nodiscard]] ScopeKind GetScopeKind() const override { return ScopeKind::InCore; } - [[nodiscard]] std::string TypeName() const override { return "InCoreScopeStmt"; } - - static constexpr auto GetFieldDescriptors() { - return std::tuple_cat(ScopeStmt::GetFieldDescriptors(), - std::make_tuple(reflection::UsualField(&InCoreScopeStmt::split_, "split"))); - } - - public: - std::optional split_; // Split mode (nullopt or None for no split) -}; - -using InCoreScopeStmtPtr = std::shared_ptr; - -/** - * @brief AutoInCore scope: InCore region with automatic chunking. - * - * Carries an optional `split` for cross-core transfer mode. 
- */ -class AutoInCoreScopeStmt : public ScopeStmt { - public: - AutoInCoreScopeStmt(std::optional split, std::string name_hint, StmtPtr body, Span span, - std::vector leading_comments = {}) - : ScopeStmt(std::move(name_hint), std::move(body), std::move(span), std::move(leading_comments)), - split_(split) {} - - [[nodiscard]] ObjectKind GetKind() const override { return ObjectKind::AutoInCoreScopeStmt; } - [[nodiscard]] ScopeKind GetScopeKind() const override { return ScopeKind::AutoInCore; } - [[nodiscard]] std::string TypeName() const override { return "AutoInCoreScopeStmt"; } - - static constexpr auto GetFieldDescriptors() { - return std::tuple_cat(ScopeStmt::GetFieldDescriptors(), - std::make_tuple(reflection::UsualField(&AutoInCoreScopeStmt::split_, "split"))); - } - - public: - std::optional split_; // Split mode (nullopt or None for no split) -}; - -using AutoInCoreScopeStmtPtr = std::shared_ptr; - /** * @brief Cluster scope: co-scheduled AIC + AIV group. * @@ -831,15 +770,18 @@ using ClusterScopeStmtPtr = std::shared_ptr; /** * @brief Hierarchy scope: distributed-hierarchy region. * - * Required `level`, optional `role`. Outlined into level-/role-annotated functions. + * Required `level`, optional `role`, optional `split`. `split` is only valid + * when `level == Level::CORE_GROUP`; it carries the AIC/AIV cross-core split + * mode through to the outlined InCore function. */ class HierarchyScopeStmt : public ScopeStmt { public: - HierarchyScopeStmt(Level level, std::optional role, std::string name_hint, StmtPtr body, Span span, - std::vector leading_comments = {}) - : ScopeStmt(std::move(name_hint), std::move(body), std::move(span), std::move(leading_comments)), - level_(level), - role_(role) {} + // Out-of-line definition in src/ir/stmt.cpp so the CORE_GROUP check can see + // the full Level enum (function.h is not included from this header to avoid + // a circular dependency). 
+ HierarchyScopeStmt(Level level, std::optional role, std::optional split, + std::string name_hint, StmtPtr body, Span span, + std::vector leading_comments = {}); [[nodiscard]] ObjectKind GetKind() const override { return ObjectKind::HierarchyScopeStmt; } [[nodiscard]] ScopeKind GetScopeKind() const override { return ScopeKind::Hierarchy; } @@ -848,12 +790,14 @@ class HierarchyScopeStmt : public ScopeStmt { static constexpr auto GetFieldDescriptors() { return std::tuple_cat(ScopeStmt::GetFieldDescriptors(), std::make_tuple(reflection::UsualField(&HierarchyScopeStmt::level_, "level"), - reflection::UsualField(&HierarchyScopeStmt::role_, "role"))); + reflection::UsualField(&HierarchyScopeStmt::role_, "role"), + reflection::UsualField(&HierarchyScopeStmt::split_, "split"))); } public: - Level level_; ///< Hierarchy level (required) - std::optional role_; ///< Function role (Orchestrator or Worker) + Level level_; ///< Hierarchy level (required) + std::optional role_; ///< Function role (Orchestrator or Worker) + std::optional split_; ///< AIC/AIV split mode (only valid at CORE_GROUP) }; using HierarchyScopeStmtPtr = std::shared_ptr; diff --git a/include/pypto/ir/transforms/base/functor.h b/include/pypto/ir/transforms/base/functor.h index 3216913bd..ceeb01b87 100644 --- a/include/pypto/ir/transforms/base/functor.h +++ b/include/pypto/ir/transforms/base/functor.h @@ -186,8 +186,6 @@ class StmtFunctor { virtual R VisitStmt_(const ReturnStmtPtr& op, Args... args) = 0; virtual R VisitStmt_(const ForStmtPtr& op, Args... args) = 0; virtual R VisitStmt_(const WhileStmtPtr& op, Args... args) = 0; - virtual R VisitStmt_(const InCoreScopeStmtPtr& op, Args... args) = 0; - virtual R VisitStmt_(const AutoInCoreScopeStmtPtr& op, Args... args) = 0; virtual R VisitStmt_(const ClusterScopeStmtPtr& op, Args... args) = 0; virtual R VisitStmt_(const HierarchyScopeStmtPtr& op, Args... args) = 0; virtual R VisitStmt_(const SpmdScopeStmtPtr& op, Args... 
args) = 0; @@ -213,8 +211,6 @@ R StmtFunctor::VisitStmt(const StmtPtr& stmt, Args... args) { STMT_FUNCTOR_DISPATCH(ReturnStmt); STMT_FUNCTOR_DISPATCH(ForStmt); STMT_FUNCTOR_DISPATCH(WhileStmt); - STMT_FUNCTOR_DISPATCH(InCoreScopeStmt); - STMT_FUNCTOR_DISPATCH(AutoInCoreScopeStmt); STMT_FUNCTOR_DISPATCH(ClusterScopeStmt); STMT_FUNCTOR_DISPATCH(HierarchyScopeStmt); STMT_FUNCTOR_DISPATCH(SpmdScopeStmt); diff --git a/include/pypto/ir/transforms/base/mutator.h b/include/pypto/ir/transforms/base/mutator.h index 69ccc051c..712db545d 100644 --- a/include/pypto/ir/transforms/base/mutator.h +++ b/include/pypto/ir/transforms/base/mutator.h @@ -95,8 +95,6 @@ class IRMutator : public ExprFunctor, public StmtFunctor { StmtPtr VisitStmt_(const ReturnStmtPtr& op) override; StmtPtr VisitStmt_(const ForStmtPtr& op) override; StmtPtr VisitStmt_(const WhileStmtPtr& op) override; - StmtPtr VisitStmt_(const InCoreScopeStmtPtr& op) override; - StmtPtr VisitStmt_(const AutoInCoreScopeStmtPtr& op) override; StmtPtr VisitStmt_(const ClusterScopeStmtPtr& op) override; StmtPtr VisitStmt_(const HierarchyScopeStmtPtr& op) override; StmtPtr VisitStmt_(const SpmdScopeStmtPtr& op) override; diff --git a/include/pypto/ir/transforms/base/visitor.h b/include/pypto/ir/transforms/base/visitor.h index 41d4b5a7c..5f24c47c3 100644 --- a/include/pypto/ir/transforms/base/visitor.h +++ b/include/pypto/ir/transforms/base/visitor.h @@ -98,8 +98,6 @@ class IRVisitor : public IRFunctor { void VisitStmt_(const ReturnStmtPtr& op) override; void VisitStmt_(const ForStmtPtr& op) override; void VisitStmt_(const WhileStmtPtr& op) override; - void VisitStmt_(const InCoreScopeStmtPtr& op) override; - void VisitStmt_(const AutoInCoreScopeStmtPtr& op) override; void VisitStmt_(const ClusterScopeStmtPtr& op) override; void VisitStmt_(const HierarchyScopeStmtPtr& op) override; void VisitStmt_(const SpmdScopeStmtPtr& op) override; diff --git a/include/pypto/ir/transforms/ir_property.h 
b/include/pypto/ir/transforms/ir_property.h index a2b62dc48..6c3acba08 100644 --- a/include/pypto/ir/transforms/ir_property.h +++ b/include/pypto/ir/transforms/ir_property.h @@ -35,7 +35,6 @@ enum class IRProperty : uint64_t { NoNestedCalls, ///< No nested call expressions NormalizedStmtStructure, ///< Statement structure normalized NoRedundantBlocks, ///< No single-child or nested SeqStmts - SplitIncoreOrch, ///< InCore scopes outlined into separate functions HasMemRefs, ///< MemRef objects initialized on variables IncoreTileOps, ///< InCore functions use tile ops (tile types, load/store) AllocatedMemoryAddr, ///< All MemRefs have valid addresses within buffer limits @@ -45,11 +44,10 @@ enum class IRProperty : uint64_t { TileMemoryInferred, ///< TileType memory_space_ populated in InCore functions BreakContinueValid, ///< Break/continue only in sequential/while loops UseAfterDef, ///< All variable uses are dominated by a definition - HierarchyOutlined, ///< Hierarchy scopes outlined into level/role functions + HierarchyOutlined, ///< Hierarchy scopes outlined into level/role functions (CORE_GROUP→InCore funcs) StructuredCtrlFlow, ///< No BreakStmt/ContinueStmt — only structured control flow VectorKernelSplit, ///< AIV functions with split mode have tpop shapes and store offsets adjusted OutParamNotShadowed, ///< Out/InOut params are not reassigned with tensor-creating ops - NoNestedInCore, ///< No nested InCore scopes (ScopeStmt inside ScopeStmt) InOutUseValid, ///< No reads of InOut/Out-passed variables after the call (RFC #1026) kCount ///< Sentinel (must be last) }; @@ -191,7 +189,7 @@ const IRPropertySet& GetVerifiedProperties(); * * These are verified automatically at pipeline start and never declared * in per-pass PassProperties. Returns {TypeChecked, BreakContinueValid, - * NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore}. + * NoRedundantBlocks, UseAfterDef, OutParamNotShadowed}. 
*/ const IRPropertySet& GetStructuralProperties(); @@ -199,7 +197,7 @@ const IRPropertySet& GetStructuralProperties(); * @brief Default property set for explicit verification * * Returns {SSAForm, TypeChecked, NoNestedCalls, BreakContinueValid, - * NoRedundantBlocks, UseAfterDef, OutParamNotShadowed, NoNestedInCore} — the properties checked by + * NoRedundantBlocks, UseAfterDef, OutParamNotShadowed} — the properties checked by * run_verifier() when no explicit set is given. */ const IRPropertySet& GetDefaultVerifyProperties(); diff --git a/include/pypto/ir/transforms/pass_properties.h b/include/pypto/ir/transforms/pass_properties.h index 8c5063347..9e092e3da 100644 --- a/include/pypto/ir/transforms/pass_properties.h +++ b/include/pypto/ir/transforms/pass_properties.h @@ -32,18 +32,6 @@ inline const PassProperties kUnrollLoopsProperties{}; inline const PassProperties kCtrlFlowTransformProperties{.produced = {IRProperty::StructuredCtrlFlow}}; -// -- Loop chunking pass (runs after SSA) -------------------------------------- - -inline const PassProperties kSplitChunkedLoopsProperties{ - .required = {IRProperty::SSAForm, IRProperty::NormalizedStmtStructure}, - .produced = {IRProperty::SSAForm, IRProperty::NormalizedStmtStructure}}; - -// -- Chunk loop interchange pass (runs after SplitChunkedLoops) --------------- - -inline const PassProperties kInterchangeChunkLoopsProperties{ - .required = {IRProperty::SSAForm, IRProperty::NormalizedStmtStructure}, - .produced = {IRProperty::SSAForm, IRProperty::NormalizedStmtStructure}}; - // -- SSA conversion pass ------------------------------------------------------ inline const PassProperties kConvertToSSAProperties{.produced = {IRProperty::SSAForm}, @@ -62,32 +50,40 @@ inline const PassProperties kNormalizeStmtStructureProperties{ inline const PassProperties kSimplifyProperties{}; -// -- Outlining pass ----------------------------------------------------------- - -inline const PassProperties kOutlineIncoreScopesProperties{ - 
.required = {IRProperty::SSAForm}, .produced = {IRProperty::SSAForm, IRProperty::SplitIncoreOrch}}; - // -- Cluster outlining pass --------------------------------------------------- inline const PassProperties kOutlineClusterScopesProperties{ .required = {IRProperty::SSAForm}, .produced = {IRProperty::SSAForm, IRProperty::ClusterOutlined}}; -// -- Hierarchy outlining pass ------------------------------------------------- +// -- Hierarchy outlining passes ----------------------------------------------- +// +// Hierarchy outlining is split between two passes that share the +// `HierarchyOutlined` property: +// - OutlineHierarchyScopes outlines every HierarchyScopeStmt with +// `level_ != CORE_GROUP` into Opaque functions. CORE_GROUP scopes are +// preserved verbatim for the next pass. +// - OutlineIncoreScopes outlines the remaining CORE_GROUP HierarchyScopeStmts +// into InCore functions and promotes the parent function from Opaque to +// Orchestration. It produces `HierarchyOutlined` (no Hierarchy scopes +// remain in Opaque/Orchestration bodies after both passes have run). 
+ +inline const PassProperties kOutlineHierarchyScopesProperties{.required = {IRProperty::SSAForm}, + .produced = {IRProperty::SSAForm}}; -inline const PassProperties kOutlineHierarchyScopesProperties{ +inline const PassProperties kOutlineIncoreScopesProperties{ .required = {IRProperty::SSAForm}, .produced = {IRProperty::SSAForm, IRProperty::HierarchyOutlined}}; // -- Tensor-to-tile conversion pass ------------------------------------------ inline const PassProperties kConvertTensorToTileOpsProperties{ - .required = {IRProperty::SSAForm, IRProperty::SplitIncoreOrch, IRProperty::NormalizedStmtStructure}, + .required = {IRProperty::SSAForm, IRProperty::HierarchyOutlined, IRProperty::NormalizedStmtStructure}, .produced = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::NormalizedStmtStructure}}; // -- Orchestration tensor optimization pass ----------------------------------- inline const PassProperties kOptimizeOrchTensorsProperties{ - .required = {IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps}, - .produced = {IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps}}; + .required = {IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps}, + .produced = {IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps}}; // -- Tile ND-to-2D flattening pass -------------------------------------------- @@ -98,31 +94,31 @@ inline const PassProperties kFlattenTileNdTo2DProperties{ // -- Tile memory space inference pass ----------------------------------------- inline const PassProperties kInferTileMemorySpaceProperties{ - .required = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::SplitIncoreOrch, + .required = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::HierarchyOutlined, IRProperty::NormalizedStmtStructure}, .produced = {IRProperty::SSAForm, IRProperty::TileMemoryInferred, IRProperty::NormalizedStmtStructure}}; // -- Resolve transpose layout pass -------------------------------------------- inline const PassProperties 
kResolveTransposeLayoutProperties{ - .required = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::SplitIncoreOrch, + .required = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::HierarchyOutlined, IRProperty::TileOps2D}, - .produced = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::SplitIncoreOrch, + .produced = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::HierarchyOutlined, IRProperty::TileOps2D}}; // -- Resolve backend op layouts pass ------------------------------------------ inline const PassProperties kResolveBackendOpLayoutsProperties{ - .required = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::SplitIncoreOrch, + .required = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::HierarchyOutlined, IRProperty::TileOps2D}, - .produced = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::SplitIncoreOrch, + .produced = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::HierarchyOutlined, IRProperty::TileOps2D}, .invalidated = {IRProperty::NormalizedStmtStructure}}; // -- Mixed kernel expansion pass ---------------------------------------------- inline const PassProperties kExpandMixedKernelProperties{ - .required = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::SplitIncoreOrch, + .required = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::HierarchyOutlined, IRProperty::TileOps2D, IRProperty::TileMemoryInferred, IRProperty::NormalizedStmtStructure}, .produced = {IRProperty::SSAForm, IRProperty::MixedKernelExpanded, IRProperty::NormalizedStmtStructure}}; @@ -135,42 +131,42 @@ inline const PassProperties kSplitVectorKernelProperties{ // -- Memory / codegen passes -------------------------------------------------- inline const PassProperties kInitMemRefProperties{ - .required = {IRProperty::SSAForm, IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps, + .required = {IRProperty::SSAForm, IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps, 
IRProperty::TileOps2D, IRProperty::TileMemoryInferred}, .produced = {IRProperty::HasMemRefs, IRProperty::NormalizedStmtStructure}, .invalidated = {IRProperty::SSAForm}}; inline const PassProperties kMemoryReuseProperties{ - .required = {IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps, IRProperty::HasMemRefs, + .required = {IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps, IRProperty::HasMemRefs, IRProperty::TileOps2D, IRProperty::NormalizedStmtStructure}, .produced = {IRProperty::NormalizedStmtStructure}}; inline const PassProperties kInsertSyncProperties{ - .required = {IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps, IRProperty::HasMemRefs, + .required = {IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps, IRProperty::HasMemRefs, IRProperty::TileOps2D}}; inline const PassProperties kAllocateMemoryAddrProperties{ - .required = {IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps, IRProperty::HasMemRefs, + .required = {IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps, IRProperty::HasMemRefs, IRProperty::TileOps2D}, .produced = {IRProperty::AllocatedMemoryAddr}}; // -- Return order normalization pass ------------------------------------------ inline const PassProperties kNormalizeReturnOrderProperties{ - .required = {IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps}}; + .required = {IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps}}; // -- Partial unroll + reorder passes (tile-level, before InitMemRef) --------- inline const PassProperties kPartialUnrollTileLoopsProperties{ - .required = {IRProperty::SSAForm, IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps, + .required = {IRProperty::SSAForm, IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps, IRProperty::TileOps2D, IRProperty::TileMemoryInferred, IRProperty::NormalizedStmtStructure}, - .produced = {IRProperty::SSAForm, IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps, + .produced = {IRProperty::SSAForm, IRProperty::HierarchyOutlined, 
IRProperty::IncoreTileOps, IRProperty::TileOps2D, IRProperty::TileMemoryInferred, IRProperty::NormalizedStmtStructure}}; inline const PassProperties kReorderUnrolledIOProperties{ - .required = {IRProperty::SSAForm, IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps, + .required = {IRProperty::SSAForm, IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps, IRProperty::TileOps2D, IRProperty::TileMemoryInferred, IRProperty::NormalizedStmtStructure}, - .produced = {IRProperty::SSAForm, IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps, + .produced = {IRProperty::SSAForm, IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps, IRProperty::TileOps2D, IRProperty::TileMemoryInferred, IRProperty::NormalizedStmtStructure}}; } // namespace pass diff --git a/include/pypto/ir/transforms/passes.h b/include/pypto/ir/transforms/passes.h index abbf21f1b..400846a09 100644 --- a/include/pypto/ir/transforms/passes.h +++ b/include/pypto/ir/transforms/passes.h @@ -182,25 +182,6 @@ Pass InsertSync(); */ Pass AllocateMemoryAddr(); -/** - * @brief Create a loop chunking pass - * - * Splits ForStmt nodes with chunk_size into nested loops: an outer loop - * iterating over chunk indices and an inner loop iterating within each chunk. - * Requires SSA form input and produces SSA form output. - */ -Pass SplitChunkedLoops(); - -/** - * @brief Interchange chunk loops and insert InCore scopes - * - * Reorders nested ChunkOuter/ChunkInner loop pairs so that all outer loops - * are on top, then wraps the inner loops + body in a ScopeStmt(InCore). - * Only interchanges when all ChunkInner loops are Parallel. - * Requires SSA form input and produces SSA form output. 
- */ -Pass InterchangeChunkLoops(); - /** * @brief Create a loop unrolling pass * @@ -263,23 +244,35 @@ Pass CtrlFlowTransform(); Pass ConvertToSSA(); /** - * @brief Outline InCore scopes into separate functions + * @brief Outline non-CORE_GROUP Hierarchy scopes into separate Opaque functions + * + * Outlines every `HierarchyScopeStmt` whose `level_` is anything other than + * `CORE_GROUP`, carrying the scope's level/role onto the outlined function. The + * parent function's type is preserved (it stays `Opaque`). CORE_GROUP scopes + * survive this pass for `OutlineIncoreScopes` to handle. * * Requirements: * - Input IR must be in SSA form (run ConvertToSSA first) - * - Only processes Opaque functions + * - Only processes Opaque functions containing Hierarchy scopes + * - Should run before OutlineIncoreScopes and OutlineClusterScopes */ -Pass OutlineIncoreScopes(); +Pass OutlineHierarchyScopes(); /** - * @brief Outline Hierarchy scopes into separate functions with level/role + * @brief Outline CORE_GROUP Hierarchy scopes into InCore functions + * + * Outlines every `HierarchyScopeStmt(level=CORE_GROUP)` into a separate + * `Function(InCore)` and promotes the parent function from `Opaque` to + * `Orchestration` when any CORE_GROUP scope was outlined. Together with + * `OutlineHierarchyScopes`, establishes the `HierarchyOutlined` property: no + * `HierarchyScopeStmt` remains in any Opaque/Orchestration function body. 
* * Requirements: * - Input IR must be in SSA form (run ConvertToSSA first) - * - Only processes Opaque functions containing Hierarchy scopes - * - Should run before OutlineIncoreScopes and OutlineClusterScopes + * - Should run after OutlineHierarchyScopes and before OutlineClusterScopes + * - Only processes Opaque functions */ -Pass OutlineHierarchyScopes(); +Pass OutlineIncoreScopes(); /** * @brief Outline Cluster scopes into separate Group functions @@ -298,7 +291,7 @@ Pass OutlineClusterScopes(); * orchestration call sites with tensor.create for output parameters. * * Requirements: - * - Input IR must have InCore scopes outlined (run OutlineIncoreScopes first) + * - Input IR must have InCore functions outlined (run OutlineHierarchyScopes first) */ Pass ConvertTensorToTileOps(); @@ -363,7 +356,7 @@ Pass InferTileMemorySpace(); * * Requirements: * - Input IR must have tile ops (run ConvertTensorToTileOps first) - * - Input IR must have InCore scopes outlined (run OutlineIncoreScopes first) + * - Input IR must have InCore functions outlined (run OutlineHierarchyScopes first) */ Pass ResolveTransposeLayout(); @@ -386,7 +379,7 @@ Pass ResolveBackendOpLayouts(); * * Requirements: * - Input IR must have tile ops (run ConvertTensorToTileOps first) - * - Input IR must have InCore scopes outlined (run OutlineIncoreScopes first) + * - Input IR must have InCore functions outlined (run OutlineHierarchyScopes first) */ Pass ExpandMixedKernel(); diff --git a/include/pypto/ir/transforms/utils/scope_outline_utils.h b/include/pypto/ir/transforms/utils/scope_outline_utils.h index 2a52d6fed..d60d9e1be 100644 --- a/include/pypto/ir/transforms/utils/scope_outline_utils.h +++ b/include/pypto/ir/transforms/utils/scope_outline_utils.h @@ -199,13 +199,29 @@ class VarCollector : public IRVisitor { * and a naming suffix. 
Handles SeqStmts specially to determine which scope-defined * variables are actually used after each scope (output filtering), and recursively * transforms scope bodies to handle nested scopes. + * + * For HierarchyScopeStmt, an optional `level_filter_` narrows which scopes are + * outlined: when set with mode `Only`, only scopes whose `level_` matches are + * outlined; when set with mode `Exclude`, scopes at the matching level are + * skipped (and the mutator descends into their body to outline nested scopes + * normally). Used to split outlining into two passes: `OutlineHierarchyScopes` + * (excludes CORE_GROUP, emits Opaque) and `OutlineIncoreScopes` (only + * CORE_GROUP, emits InCore). */ class ScopeOutliner : public IRMutator { public: + /// Hierarchy-level filter for ScopeOutliner. + struct HierarchyLevelFilter { + enum class Mode { Only, Exclude }; + Level level; + Mode mode; + }; + ScopeOutliner(std::string func_name, const std::unordered_map& var_types, const std::unordered_map& var_objects, const std::unordered_set& known_names, ScopeKind target_scope_kind, - FunctionType outlined_func_type, std::string name_suffix, ProgramPtr program = nullptr) + FunctionType outlined_func_type, std::string name_suffix, ProgramPtr program = nullptr, + std::optional level_filter = std::nullopt) : func_name_(std::move(func_name)), var_types_(var_types), var_objects_(var_objects), @@ -213,7 +229,8 @@ class ScopeOutliner : public IRMutator { target_scope_kind_(target_scope_kind), outlined_func_type_(outlined_func_type), name_suffix_(std::move(name_suffix)), - program_(std::move(program)) {} + program_(std::move(program)), + level_filter_(level_filter) {} [[nodiscard]] const std::vector& GetOutlinedFunctions() const { return outlined_functions_; } @@ -244,7 +261,7 @@ class ScopeOutliner : public IRMutator { for (size_t i = 0; i < op->stmts_.size(); ++i) { auto scope = std::dynamic_pointer_cast(op->stmts_[i]); - if (scope && scope->GetScopeKind() == target_scope_kind_) { + if 
(scope && scope->GetScopeKind() == target_scope_kind_ && ShouldOutline(scope)) { // Collect variables referenced in all subsequent statements VarDefUseCollector after_ref_collector; for (size_t j = i + 1; j < op->stmts_.size(); ++j) { @@ -300,6 +317,9 @@ class ScopeOutliner : public IRMutator { if (op->GetScopeKind() != target_scope_kind_) { return IRMutator::VisitStmt_(op); } + if (!ShouldOutline(op)) { + return IRMutator::VisitStmt_(op); + } VarDefUseCollector def_collector; def_collector.VisitStmt(op->body_); StoreTargetCollector store_collector; @@ -308,12 +328,21 @@ class ScopeOutliner : public IRMutator { return OutlineScope(op, def_collector.var_defs); } - StmtPtr VisitStmt_(const InCoreScopeStmtPtr& op) override { return VisitScopeKind(op); } - StmtPtr VisitStmt_(const AutoInCoreScopeStmtPtr& op) override { return VisitScopeKind(op); } StmtPtr VisitStmt_(const ClusterScopeStmtPtr& op) override { return VisitScopeKind(op); } StmtPtr VisitStmt_(const HierarchyScopeStmtPtr& op) override { return VisitScopeKind(op); } StmtPtr VisitStmt_(const SpmdScopeStmtPtr& op) override { return VisitScopeKind(op); } + /// Apply the optional hierarchy-level filter. Non-Hierarchy scopes are + /// unaffected; Hierarchy scopes are matched against `level_filter_.level` + /// and accepted/rejected per `level_filter_.mode`. + bool ShouldOutline(const ScopeStmtPtr& op) const { + if (!level_filter_.has_value()) return true; + auto hier = As(op); + if (!hier) return true; + bool matches = (hier->level_ == level_filter_->level); + return level_filter_->mode == HierarchyLevelFilter::Mode::Only ? matches : !matches; + } + private: /** * @brief Outline a single scope into a separate function. 
@@ -540,18 +569,12 @@ class ScopeOutliner : public IRMutator { outlined_body = std::make_shared(body_stmts, op->span_); } - // Register the outlined function (propagate level/role from ScopeStmt, convert split/core_num to attrs) + // Register the outlined function (propagate level/role/split from ScopeStmt, convert split/core_num to + // attrs). When outlining a HierarchyScopeStmt at Level::CORE_GROUP, the outlined function becomes + // FunctionType::InCore regardless of the default outlined_func_type_ — this replaces the former + // OutlineIncoreScopes pass. std::vector> outlined_attrs; - auto append_split_attr = [&](std::optional split) { - if (split.has_value() && split.value() != SplitMode::None) { - outlined_attrs.emplace_back("split", static_cast(split.value())); - } - }; - if (auto incore = As(op)) { - append_split_attr(incore->split_); - } else if (auto auto_incore = As(op)) { - append_split_attr(auto_incore->split_); - } else if (auto spmd = As(op)) { + if (auto spmd = As(op)) { outlined_attrs.emplace_back("core_num", spmd->core_num_); if (spmd->sync_start_) { outlined_attrs.emplace_back("sync_start", true); @@ -562,6 +585,9 @@ class ScopeOutliner : public IRMutator { if (auto hier = As(op)) { outlined_level = hier->level_; outlined_role = hier->role_; + if (hier->split_.has_value() && hier->split_.value() != SplitMode::None) { + outlined_attrs.emplace_back("split", static_cast(hier->split_.value())); + } } auto outlined_func = std::make_shared( outlined_func_name, input_params, input_param_directions, return_types, outlined_body, op->span_, @@ -793,6 +819,7 @@ class ScopeOutliner : public IRMutator { FunctionType outlined_func_type_; std::string name_suffix_; ProgramPtr program_; + std::optional level_filter_; int scope_counter_ = 0; std::vector outlined_functions_; }; @@ -807,7 +834,7 @@ class ScopeOutliner : public IRMutator { /// have been successfully outlined into separate functions. 
/// /// Usage: -/// ScopeKindAbsenceVerifier verifier(diagnostics, "PassName", "error message"); +/// ScopeKindAbsenceVerifier verifier(diagnostics, "PassName", "error message"); /// verifier.VisitStmt(func->body_); template class ScopeKindAbsenceVerifier : public IRVisitor { @@ -824,8 +851,6 @@ class ScopeKindAbsenceVerifier : public IRVisitor { IRVisitor::VisitStmt_(op); } - void VisitStmt_(const InCoreScopeStmtPtr& op) override { CheckKind(op); } - void VisitStmt_(const AutoInCoreScopeStmtPtr& op) override { CheckKind(op); } void VisitStmt_(const ClusterScopeStmtPtr& op) override { CheckKind(op); } void VisitStmt_(const HierarchyScopeStmtPtr& op) override { CheckKind(op); } void VisitStmt_(const SpmdScopeStmtPtr& op) override { CheckKind(op); } diff --git a/include/pypto/ir/verifier/verifier.h b/include/pypto/ir/verifier/verifier.h index 2fcb11603..59bc00b8e 100644 --- a/include/pypto/ir/verifier/verifier.h +++ b/include/pypto/ir/verifier/verifier.h @@ -106,12 +106,6 @@ PropertyVerifierPtr CreateNormalizedStmtPropertyVerifier(); */ PropertyVerifierPtr CreateNoRedundantBlocksPropertyVerifier(); -/** - * @brief Factory function for creating SplitIncoreOrch property verifier - * @return Shared pointer to SplitIncoreOrch PropertyVerifier - */ -PropertyVerifierPtr CreateSplitIncoreOrchPropertyVerifier(); - /** * @brief Factory function for creating ClusterOutlined property verifier * @return Shared pointer to ClusterOutlined PropertyVerifier @@ -213,14 +207,6 @@ PropertyVerifierPtr CreateStructuredCtrlFlowPropertyVerifier(); */ PropertyVerifierPtr CreateOutParamNotShadowedPropertyVerifier(); -/** - * @brief Factory function for creating NoNestedInCore property verifier - * - * Verifies that no ScopeStmt(InCore) is nested inside another ScopeStmt(InCore). 
- * @return Shared pointer to NoNestedInCore PropertyVerifier - */ -PropertyVerifierPtr CreateNoNestedIncorePropertyVerifier(); - /** * @brief Factory function for creating InOutUseValid property verifier * diff --git a/pyproject.toml b/pyproject.toml index 5933663e1..315b653ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,8 +81,6 @@ fixable = ["ALL"] "tests/ut/ir/transforms/test_init_memref.py" = ["E501"] "tests/ut/ir/transforms/test_memory_reuse.py" = ["E501"] "tests/ut/ir/transforms/test_infer_tile_memory_space.py" = ["E501"] -"tests/ut/ir/transforms/test_interchange_chunk_loops.py" = ["E501", "F841"] -"tests/ut/ir/transforms/test_outline_incore_interleaved_ops.py" = ["E501"] "tests/ut/ir/transforms/test_legalize_pto_buffer_reuse.py" = ["E501", "F841"] # IR dumps are formatted at 200-col for readability — suppress line-length lint "build_output/**" = ["E501"] diff --git a/python/bindings/modules/functor.cpp b/python/bindings/modules/functor.cpp index 0c65960f0..117abbb6e 100644 --- a/python/bindings/modules/functor.cpp +++ b/python/bindings/modules/functor.cpp @@ -50,7 +50,7 @@ using namespace pypto::ir; // NOLINT(build/namespaces) // --- IRVisitor trampoline --- struct PyIRVisitor : IRVisitor { - NB_TRAMPOLINE(IRVisitor, 59); // 31 base + 23 binary + 5 unary (5 scope kinds) + NB_TRAMPOLINE(IRVisitor, 57); // 29 base + 23 binary + 5 unary (3 scope kinds) // Top-level entry points void VisitProgram(const ProgramPtr& p) override { NB_OVERRIDE_NAME("visit_program", VisitProgram, p); } @@ -128,8 +128,6 @@ struct PyIRVisitor : IRVisitor { VISITOR_STMT_TRAMPOLINE(IfStmt, visit_if_stmt) VISITOR_STMT_TRAMPOLINE(ForStmt, visit_for_stmt) VISITOR_STMT_TRAMPOLINE(WhileStmt, visit_while_stmt) - VISITOR_STMT_TRAMPOLINE(InCoreScopeStmt, visit_in_core_scope_stmt) - VISITOR_STMT_TRAMPOLINE(AutoInCoreScopeStmt, visit_auto_in_core_scope_stmt) VISITOR_STMT_TRAMPOLINE(ClusterScopeStmt, visit_cluster_scope_stmt) VISITOR_STMT_TRAMPOLINE(HierarchyScopeStmt, 
visit_hierarchy_scope_stmt) VISITOR_STMT_TRAMPOLINE(SpmdScopeStmt, visit_spmd_scope_stmt) @@ -143,7 +141,7 @@ struct PyIRVisitor : IRVisitor { // --- IRMutator trampoline --- struct PyIRMutator : IRMutator { - NB_TRAMPOLINE(IRMutator, 58); // 30 base + 23 binary + 5 unary (5 scope kinds) + NB_TRAMPOLINE(IRMutator, 56); // 28 base + 23 binary + 5 unary (3 scope kinds) // Top-level entry points ProgramPtr VisitProgram(const ProgramPtr& p) override { @@ -222,8 +220,6 @@ struct PyIRMutator : IRMutator { MUTATOR_STMT_TRAMPOLINE(IfStmt, visit_if_stmt) MUTATOR_STMT_TRAMPOLINE(ForStmt, visit_for_stmt) MUTATOR_STMT_TRAMPOLINE(WhileStmt, visit_while_stmt) - MUTATOR_STMT_TRAMPOLINE(InCoreScopeStmt, visit_in_core_scope_stmt) - MUTATOR_STMT_TRAMPOLINE(AutoInCoreScopeStmt, visit_auto_in_core_scope_stmt) MUTATOR_STMT_TRAMPOLINE(ClusterScopeStmt, visit_cluster_scope_stmt) MUTATOR_STMT_TRAMPOLINE(HierarchyScopeStmt, visit_hierarchy_scope_stmt) MUTATOR_STMT_TRAMPOLINE(SpmdScopeStmt, visit_spmd_scope_stmt) @@ -350,8 +346,6 @@ void BindFunctor(nb::module_& m) { BIND_VISITOR(visitor_cls, IfStmt, visit_if_stmt); BIND_VISITOR(visitor_cls, ForStmt, visit_for_stmt); BIND_VISITOR(visitor_cls, WhileStmt, visit_while_stmt); - BIND_VISITOR(visitor_cls, InCoreScopeStmt, visit_in_core_scope_stmt); - BIND_VISITOR(visitor_cls, AutoInCoreScopeStmt, visit_auto_in_core_scope_stmt); BIND_VISITOR(visitor_cls, ClusterScopeStmt, visit_cluster_scope_stmt); BIND_VISITOR(visitor_cls, HierarchyScopeStmt, visit_hierarchy_scope_stmt); BIND_VISITOR(visitor_cls, SpmdScopeStmt, visit_spmd_scope_stmt); @@ -445,8 +439,6 @@ void BindFunctor(nb::module_& m) { BIND_MUTATOR(mutator_cls, IfStmt, visit_if_stmt); BIND_MUTATOR(mutator_cls, ForStmt, visit_for_stmt); BIND_MUTATOR(mutator_cls, WhileStmt, visit_while_stmt); - BIND_MUTATOR(mutator_cls, InCoreScopeStmt, visit_in_core_scope_stmt); - BIND_MUTATOR(mutator_cls, AutoInCoreScopeStmt, visit_auto_in_core_scope_stmt); BIND_MUTATOR(mutator_cls, ClusterScopeStmt, 
visit_cluster_scope_stmt); BIND_MUTATOR(mutator_cls, HierarchyScopeStmt, visit_hierarchy_scope_stmt); BIND_MUTATOR(mutator_cls, SpmdScopeStmt, visit_spmd_scope_stmt); diff --git a/python/bindings/modules/ir.cpp b/python/bindings/modules/ir.cpp index c85fb6ab6..fc72bf72f 100644 --- a/python/bindings/modules/ir.cpp +++ b/python/bindings/modules/ir.cpp @@ -927,10 +927,8 @@ void BindIR(nb::module_& m) { // ScopeKind enum nb::enum_(ir, "ScopeKind", "Scope kind classification") - .value("InCore", ScopeKind::InCore, "InCore scope for AICore sub-graphs") - .value("AutoInCore", ScopeKind::AutoInCore, "AutoInCore scope for automatic chunking") .value("Cluster", ScopeKind::Cluster, "Cluster scope for co-scheduled AIC + AIV groups") - .value("Hierarchy", ScopeKind::Hierarchy, "Distributed hierarchy scope (uses level/role)") + .value("Hierarchy", ScopeKind::Hierarchy, "Distributed hierarchy scope (uses level/role/split)") .value("Spmd", ScopeKind::Spmd, "SPMD dispatch scope (core_num/sync_start)") .export_values(); @@ -947,23 +945,6 @@ void BindIR(nb::module_& m) { scope_stmt_class.def_prop_ro("scope_kind", &ScopeStmt::GetScopeKind, "Discriminator for the scope kind"); BindFields(scope_stmt_class); // exposes name_hint, body - // InCoreScopeStmt - auto in_core_scope_stmt_class = - nb::class_(ir, "InCoreScopeStmt", "InCore scope: AICore sub-graph region"); - in_core_scope_stmt_class.def(nb::init, std::string, const StmtPtr&, const Span&>(), - nb::arg("split") = nb::none(), nb::arg("name_hint") = "", nb::arg("body"), - nb::arg("span"), "Create an InCore scope statement"); - BindFields(in_core_scope_stmt_class); - - // AutoInCoreScopeStmt - auto auto_in_core_scope_stmt_class = nb::class_( - ir, "AutoInCoreScopeStmt", "AutoInCore scope: InCore region with automatic chunking"); - auto_in_core_scope_stmt_class.def( - nb::init, std::string, const StmtPtr&, const Span&>(), - nb::arg("split") = nb::none(), nb::arg("name_hint") = "", nb::arg("body"), nb::arg("span"), - "Create an 
AutoInCore scope statement"); - BindFields(auto_in_core_scope_stmt_class); - // ClusterScopeStmt auto cluster_scope_stmt_class = nb::class_( ir, "ClusterScopeStmt", "Cluster scope: co-scheduled AIC + AIV group"); @@ -975,10 +956,11 @@ void BindIR(nb::module_& m) { // HierarchyScopeStmt auto hierarchy_scope_stmt_class = nb::class_( ir, "HierarchyScopeStmt", "Hierarchy scope: distributed-hierarchy region"); - hierarchy_scope_stmt_class.def( - nb::init, std::string, const StmtPtr&, const Span&>(), nb::arg("level"), - nb::arg("role") = nb::none(), nb::arg("name_hint") = "", nb::arg("body"), nb::arg("span"), - "Create a Hierarchy scope statement"); + hierarchy_scope_stmt_class.def(nb::init, std::optional, std::string, + const StmtPtr&, const Span&>(), + nb::arg("level"), nb::arg("role") = nb::none(), + nb::arg("split") = nb::none(), nb::arg("name_hint") = "", nb::arg("body"), + nb::arg("span"), "Create a Hierarchy scope statement"); BindFields(hierarchy_scope_stmt_class); // SpmdScopeStmt diff --git a/python/bindings/modules/passes.cpp b/python/bindings/modules/passes.cpp index a59c36e13..0d6b382ed 100644 --- a/python/bindings/modules/passes.cpp +++ b/python/bindings/modules/passes.cpp @@ -48,7 +48,6 @@ void BindPass(nb::module_& m) { .value("NoNestedCalls", IRProperty::NoNestedCalls, "No nested call expressions") .value("NormalizedStmtStructure", IRProperty::NormalizedStmtStructure, "Statement structure normalized") .value("NoRedundantBlocks", IRProperty::NoRedundantBlocks, "No single-child or nested SeqStmts") - .value("SplitIncoreOrch", IRProperty::SplitIncoreOrch, "InCore scopes outlined into separate functions") .value("HasMemRefs", IRProperty::HasMemRefs, "MemRef objects initialized on variables") .value("IncoreTileOps", IRProperty::IncoreTileOps, "InCore functions use tile ops (tile types, load/store)") @@ -69,9 +68,7 @@ void BindPass(nb::module_& m) { "No BreakStmt/ContinueStmt — only structured control flow") .value("VectorKernelSplit", 
IRProperty::VectorKernelSplit, "AIV functions with split mode have tpop shapes and store offsets adjusted") - .value("OutParamNotShadowed", IRProperty::OutParamNotShadowed, "Out/InOut params are not reassigned") - .value("NoNestedInCore", IRProperty::NoNestedInCore, - "No nested InCore scopes (ScopeStmt inside ScopeStmt)"); + .value("OutParamNotShadowed", IRProperty::OutParamNotShadowed, "Out/InOut params are not reassigned"); // Bind IRPropertySet nb::class_(passes, "IRPropertySet", "A set of IR properties") @@ -318,10 +315,6 @@ void BindPass(nb::module_& m) { .value("USE_BEFORE_DEF", use_after_def::ErrorType::USE_BEFORE_DEF, "Variable used before any definition in scope"); - passes.def("split_chunked_loops", &pass::SplitChunkedLoops, - "Create a pass that splits chunked loops into nested loops"); - passes.def("interchange_chunk_loops", &pass::InterchangeChunkLoops, - "Create a pass that interchanges chunk loops and inserts InCore scopes"); passes.def("unroll_loops", &pass::UnrollLoops, "Create a loop unrolling pass"); passes.def("partial_unroll_tile_loops", &pass::PartialUnrollTileLoops, "Lower ``pl.range(N, unroll=F)`` loops at the tile level: replicate the body F\n" @@ -334,13 +327,15 @@ void BindPass(nb::module_& m) { passes.def("ctrl_flow_transform", &pass::CtrlFlowTransform, "Create a control flow structuring pass (eliminate break/continue)"); passes.def("convert_to_ssa", &pass::ConvertToSSA, "Create an SSA conversion pass"); - passes.def("outline_incore_scopes", &pass::OutlineIncoreScopes, - "Create a pass that outlines InCore scopes into separate functions"); passes.def("outline_cluster_scopes", &pass::OutlineClusterScopes, "Create a pass that outlines Cluster scopes into Group functions " "and standalone Spmd scopes into Spmd functions"); passes.def("outline_hierarchy_scopes", &pass::OutlineHierarchyScopes, - "Create a pass that outlines Hierarchy scopes into separate level/role functions"); + "Create a pass that outlines non-CORE_GROUP Hierarchy scopes 
into separate Opaque " + "level/role functions. CORE_GROUP scopes are left for outline_incore_scopes."); + passes.def("outline_incore_scopes", &pass::OutlineIncoreScopes, + "Create a pass that outlines CORE_GROUP Hierarchy scopes into InCore functions " + "and promotes the parent function from Opaque to Orchestration"); passes.def("convert_tensor_to_tile_ops", &pass::ConvertTensorToTileOps, "Create a pass that converts tensor ops to tile ops in InCore functions"); passes.def("optimize_orch_tensors", &pass::OptimizeOrchTensors, diff --git a/python/pypto/ir/builder.py b/python/pypto/ir/builder.py index b46b4e245..6e4981201 100644 --- a/python/pypto/ir/builder.py +++ b/python/pypto/ir/builder.py @@ -266,11 +266,11 @@ def scope( """Context manager for building scope statements. Args: - scope_kind: The kind of scope (e.g., ir.ScopeKind.InCore) + scope_kind: The kind of scope (e.g., ir.ScopeKind.Hierarchy) span: Optional explicit span. If None, automatically captured. level: Hierarchy level (for ScopeKind.Hierarchy) role: Function role (for ScopeKind.Hierarchy) - split: Split mode for cross-core transfer (for AutoInCore scopes) + split: Split mode for cross-core transfer (for ScopeKind.Hierarchy at CORE_GROUP) name_hint: User-provided scope name hint (empty = auto-generate) core_num: SPMD block count (for ScopeKind.Spmd scopes) sync_start: Require sync-start for SPMD dispatch (for ScopeKind.Spmd scopes) @@ -279,8 +279,8 @@ def scope( ScopeBuilder: Helper object for building the scope statement Example: - >>> with ib.scope(ir.ScopeKind.InCore) as scope_builder: - ... # InCore scope body + >>> with ib.scope(ir.ScopeKind.Hierarchy, level=ir.Level.CORE_GROUP) as scope_builder: + ... # CORE_GROUP scope body (outlined to Function(InCore)) ... 
ib.assign(y, add_expr) """ begin_span = span if span is not None else self._capture_call_span() diff --git a/python/pypto/ir/pass_manager.py b/python/pypto/ir/pass_manager.py index 3de50d5ad..cd3d7da22 100644 --- a/python/pypto/ir/pass_manager.py +++ b/python/pypto/ir/pass_manager.py @@ -124,8 +124,6 @@ def _register_passes(cls): ("FlattenCallExpr", lambda: passes.flatten_call_expr()), ] tensor_only_passes: list[PassSpec] = [ - ("SplitChunkedLoops", lambda: passes.split_chunked_loops()), - ("InterchangeChunkLoops", lambda: passes.interchange_chunk_loops()), ("OutlineHierarchyScopes", lambda: passes.outline_hierarchy_scopes()), ("OutlineIncoreScopes", lambda: passes.outline_incore_scopes()), ("OutlineClusterScopes", lambda: passes.outline_cluster_scopes()), diff --git a/python/pypto/language/__init__.py b/python/pypto/language/__init__.py index b2c77df7c..1a12d42de 100644 --- a/python/pypto/language/__init__.py +++ b/python/pypto/language/__init__.py @@ -59,12 +59,9 @@ def scalar_func(x: pl.Scalar[pl.FP32]) -> pl.Scalar[pl.FP32]: from . 
import optimizations, parser from .dsl_api import ( at, - auto_incore, - chunked_loop_optimizer, cluster, cond, const, - incore, parallel, range, spmd, @@ -169,7 +166,7 @@ def scalar_func(x: pl.Scalar[pl.FP32]) -> pl.Scalar[pl.FP32]: transpose, write, ) -from .optimizations import auto_chunk, split +from .optimizations import split from .parser.decorator import InlineFunction, function, inline, program from .parser.text_parser import loads, loads_program, parse, parse_program from .typing import DynVar, InOut, IntLike, MemRef, Out, Scalar, Tensor, Tile, Tuple, dynamic @@ -236,14 +233,10 @@ def scalar_func(x: pl.Scalar[pl.FP32]) -> pl.Scalar[pl.FP32]: "static_print", "static_assert", "at", - "incore", - "auto_incore", "cluster", "spmd", - "chunked_loop_optimizer", "optimizations", "split", - "auto_chunk", "tile", "system", "tensor", diff --git a/python/pypto/language/dsl_api.py b/python/pypto/language/dsl_api.py index af019d295..6cd9895a1 100644 --- a/python/pypto/language/dsl_api.py +++ b/python/pypto/language/dsl_api.py @@ -17,56 +17,9 @@ from pypto.language.typing import Scalar, Tensor, Tile from pypto.pypto_core import ir -from pypto.pypto_core.ir import SplitMode from .optimizations import Optimization - -class _ChunkedLoopOptimizerCall: - """Result of calling chunked_loop_optimizer(split=...). - - Stores the split mode to pass to the AutoInCore scope. - """ - - def __init__(self, split: SplitMode = SplitMode.UP_DOWN) -> None: - self.split = split - - def __repr__(self) -> str: - return f"chunked_loop_optimizer(split={self.split!r})" - - -class _ChunkedLoopOptimizer: - """Sentinel type for optimization=pl.chunked_loop_optimizer in pl.at(). 
- - Can be used bare or called with a split mode: - - ``optimization=pl.chunked_loop_optimizer`` - - ``optimization=pl.chunked_loop_optimizer(split=pl.SplitMode.UP_DOWN)`` - """ - - def __call__(self, *, split: SplitMode = SplitMode.UP_DOWN) -> _ChunkedLoopOptimizerCall: - """Create an optimizer specification with an explicit split mode. - - Args: - split: Split mode for cross-core data transfer (default: SplitMode.UP_DOWN) - - Returns: - Optimizer call with the given split mode - """ - return _ChunkedLoopOptimizerCall(split=split) - - def __repr__(self) -> str: - return "chunked_loop_optimizer" - - -chunked_loop_optimizer: _ChunkedLoopOptimizer = _ChunkedLoopOptimizer() -"""Sentinel for optimization=pl.chunked_loop_optimizer in pl.at(). - -Use with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer) -to request compiler-driven chunked loop outlining (replaces pl.auto_incore()). -Can also be called with a split mode: -pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer(split=pl.SplitMode.UP_DOWN)) -""" - # Range argument type: int literal or Scalar variable RangeArg = Union[int, "Scalar"] @@ -636,96 +589,6 @@ def static_assert(condition: Any, msg: str = "") -> None: """ -class IncoreContext: - """Context manager for InCore scope. - - This is returned by pl.incore() and used with the 'with' statement. - The parser recognizes this pattern and creates a ScopeStmt(InCore). - """ - - def __init__(self, split: SplitMode = SplitMode.NONE, name_hint: str = "") -> None: - self.split = split - self.name_hint = name_hint - - def __enter__(self) -> None: - """Enter the InCore scope context.""" - pass - - def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: - """Exit the InCore scope context.""" - pass - - -class AutoIncoreContext: - """Context manager for AutoInCore scope. - - This is returned by pl.auto_incore() and used with the 'with' statement. 
- The parser recognizes this pattern and creates a ScopeStmt(AutoInCore). - """ - - def __init__(self, split: SplitMode = SplitMode.NONE, name_hint: str = "") -> None: - self.split = split - self.name_hint = name_hint - - def __enter__(self) -> None: - """Enter the AutoInCore scope context.""" - pass - - def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: - """Exit the AutoInCore scope context.""" - pass - - -def auto_incore(split: SplitMode = SplitMode.UP_DOWN, *, name_hint: str = "") -> AutoIncoreContext: - """Mark a region of code for automatic incore chunking. - - This function returns a context manager that should be used with the 'with' statement. - The parser recognizes this pattern and creates a ScopeStmt with ScopeKind.AutoInCore. - - Args: - split: Split mode for cross-core data transfer (default: SplitMode.UP_DOWN) - - Returns: - Context manager for AutoInCore scope - - Examples: - >>> with pl.auto_incore(): - ... for i in pl.parallel(0, 8, 1, chunk=4): - ... x = pl.add(x, x) - >>> with pl.auto_incore(split=pl.SplitMode.UP_DOWN): - ... for i in pl.parallel(0, 8, 1, chunk=4): - ... x = pl.add(x, x) - """ - if split == SplitMode.NONE: - raise ValueError("SplitMode.NONE is not supported by pto-isa now") - return AutoIncoreContext(split=split, name_hint=name_hint) - - -def incore(split: SplitMode = SplitMode.NONE, *, name_hint: str = "") -> IncoreContext: - """Mark a region of code as belonging to the InCore execution context. - - This function returns a context manager that should be used with the 'with' statement. - The parser recognizes this pattern and creates a ScopeStmt with ScopeKind.InCore. - - Args: - split: Split mode for cross-core data transfer (default: SplitMode.NONE). - When set, the outlined InCore function will use the specified split - mode for data transfer between AIC and AIV cores. 
- name_hint: Optional name hint for the outlined function (must be a valid identifier) - - Returns: - Context manager for InCore scope - - Examples: - >>> with pl.incore(): - ... y = pl.ops.add(x, x) - ... z = pl.ops.mul(y, y) - >>> with pl.incore(split=pl.SplitMode.UP_DOWN): - ... y = pl.ops.add(x, x) - """ - return IncoreContext(split=split, name_hint=name_hint) - - class ClusterContext: """Context manager for Cluster scope. @@ -760,7 +623,7 @@ def cluster(*, name_hint: str = "") -> ClusterContext: Examples: >>> with pl.cluster(): - ... with pl.incore(): + ... with pl.at(level=pl.Level.CORE_GROUP): ... y = pl.add(x, x) """ return ClusterContext(name_hint=name_hint) @@ -827,11 +690,11 @@ class AtContext: """Context manager for hierarchy-level scope. Returned by pl.at(level=..., role=..., optimizations=[...]) and used with the - 'with' statement. The parser recognizes this pattern and creates: - - ScopeStmt(InCore) when level=CORE_GROUP (no optimizations) - - ScopeStmt(InCore, split=...) when level=CORE_GROUP with optimizations=[pl.split(...)] - - ScopeStmt(AutoInCore) when level=CORE_GROUP with optimizations=[pl.auto_chunk] - - ScopeStmt(Hierarchy) for all other levels + 'with' statement. 
The parser emits a HierarchyScopeStmt with: + - level = the given level (required) + - role = the optional role + - split = the SplitMode from optimizations=[pl.split(mode)] (only valid at + Level.CORE_GROUP) """ def __init__( @@ -840,16 +703,11 @@ def __init__( role: ir.Role | None = None, *, optimizations: list[Optimization] | None = None, - # Deprecated kwargs (kept for back-compat; emit DeprecationWarning at parse time): - optimization: _ChunkedLoopOptimizer | _ChunkedLoopOptimizerCall | None = None, - split: SplitMode | None = None, name_hint: str = "", ) -> None: self.level = level self.role = role self.optimizations = optimizations - self.optimization = optimization - self.split = split self.name_hint = name_hint def __enter__(self) -> None: @@ -864,62 +722,39 @@ def at( role: ir.Role | None = None, *, optimizations: list[Optimization] | None = None, - # Deprecated kwargs (kept for back-compat; emit DeprecationWarning at parse time): - optimization: _ChunkedLoopOptimizer | _ChunkedLoopOptimizerCall | None = None, - split: SplitMode | None = None, name_hint: str = "", ) -> AtContext: """Mark a region of code for execution at a specific hierarchy level. - With ``level=pl.Level.CORE_GROUP``, the ``optimizations=`` list controls - the resulting scope kind: - - - no entries → ``ScopeStmt(InCore)`` - - ``pl.split(mode)`` → ``ScopeStmt(InCore, split=mode)`` - - ``pl.auto_chunk`` → ``ScopeStmt(AutoInCore)`` - - both entries → ``ScopeStmt(AutoInCore, split=mode)`` - - For all other levels, this creates a Hierarchy scope. + At ``level=pl.Level.CORE_GROUP`` the optimizations list may contain + ``pl.split(mode)`` to request a cross-core data-transfer split mode on the + outlined InCore function. Args: level: Target hierarchy level (e.g. pl.Level.HOST, pl.Level.CORE_GROUP). role: Function role (Orchestrator or Worker). Default: None. - optimizations: Optional list literal of optimization entries. 
Each - entry must be one of ``pl.auto_chunk`` or ``pl.split(mode)`` — - written inline at the call site, since the DSL parser inspects - the AST and does not accept dynamically built variables here. - Entries are independent and may be combined. - optimization: **Deprecated.** Use ``optimizations=[pl.auto_chunk]`` (or - ``optimizations=[pl.auto_chunk, pl.split(mode)]``) instead. - split: **Deprecated.** Use ``optimizations=[pl.split(mode)]`` instead. + Not supported with level=CORE_GROUP. + optimizations: Optional list literal of optimization entries. Currently + only ``pl.split(mode)`` is supported. Must be written inline at the + call site — the DSL parser inspects the AST and does not accept + dynamically built variables here. name_hint: Optional name hint for the outlined function (must be a valid identifier). Returns: - Context manager for the appropriate scope. + Context manager for a HierarchyScopeStmt. Examples: - >>> # InCore scope (replaces pl.incore()): + >>> # CORE_GROUP scope (outlined into Function(InCore)): >>> with pl.at(level=pl.Level.CORE_GROUP): ... y = pl.ops.add(x, x) - >>> # InCore scope with split hint: + >>> # CORE_GROUP scope with AIC/AIV split hint: >>> with pl.at(level=pl.Level.CORE_GROUP, ... optimizations=[pl.split(pl.SplitMode.UP_DOWN)]): ... y = pl.ops.add(x, x) - >>> # AutoInCore scope (replaces pl.auto_incore()): - >>> with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk]): - ... for i in pl.parallel(0, 8, 1, chunk=4): - ... x = pl.add(x, x) - - >>> # AutoInCore + split hint (combined, independent entries): - >>> with pl.at(level=pl.Level.CORE_GROUP, - ... optimizations=[pl.auto_chunk, pl.split(pl.SplitMode.UP_DOWN)]): - ... for i in pl.parallel(0, 8, 1, chunk=4): - ... x = pl.add(x, x) - - >>> # Hierarchy scope (unchanged behavior): + >>> # Hierarchy scope with role (non-CORE_GROUP levels): >>> with pl.at(level=pl.Level.HOST, role=pl.Role.Worker): ... 
y = pl.add(x, x) """ @@ -927,8 +762,6 @@ def at( level, role, optimizations=optimizations, - optimization=optimization, - split=split, name_hint=name_hint, ) @@ -943,16 +776,11 @@ def at( "cond", "static_print", "static_assert", - "incore", - "auto_incore", "at", "cluster", "spmd", - "chunked_loop_optimizer", "RangeIterator", "WhileIterator", - "IncoreContext", - "AutoIncoreContext", "ClusterContext", "SpmdContext", "AtContext", diff --git a/python/pypto/language/optimizations.py b/python/pypto/language/optimizations.py index 3dea0abb7..f085d7426 100644 --- a/python/pypto/language/optimizations.py +++ b/python/pypto/language/optimizations.py @@ -10,22 +10,11 @@ """Optimization config entries for ``pl.at(..., optimizations=[...])``. Each entry is an orthogonal optimization hint applied to the enclosing scope. -The entries can be combined freely in the ``optimizations=`` list. Available entries: - ``pl.split(mode)`` — Cross-core data-transfer split hint, consumed by - the ``ExpandMixedKernel`` pass. Lowers the scope to ``InCore`` with - ``split_=mode``. - - ``pl.auto_chunk`` — Request compiler-driven outlining of chunked - parallel loops. Lowers the scope to ``AutoInCore`` so that the - ``InterchangeChunkLoops`` pass can interchange and outline chunked - loops within it. - -These two entries are independent and may be combined:: - - with pl.at(level=pl.Level.CORE_GROUP, - optimizations=[pl.auto_chunk, pl.split(pl.SplitMode.UP_DOWN)]): - ... + the ``ExpandMixedKernel`` pass. Only valid at ``Level::CORE_GROUP``; + sets ``split`` on the enclosing ``HierarchyScopeStmt``. """ from __future__ import annotations @@ -43,13 +32,9 @@ class Optimization: class Split(Optimization): """Cross-core data-transfer split hint. - Sets ``ScopeStmt::split_`` on the enclosing ``pl.at`` scope; that metadata - is consumed by the ``ExpandMixedKernel`` pass via the outlined function's - ``SplitMode``. 
The split hint is independent of the resulting scope kind: - - - ``optimizations=[pl.split(mode)]`` → ``ScopeKind::InCore`` (split metadata). - - ``optimizations=[pl.auto_chunk, pl.split(mode)]`` → ``ScopeKind::AutoInCore`` - (split metadata still attached). + Sets ``HierarchyScopeStmt::split_`` on the enclosing ``pl.at`` scope. + Only valid at ``Level::CORE_GROUP``; consumed by the ``ExpandMixedKernel`` + pass via the outlined function's ``SplitMode``. Args: mode: Split mode (``SplitMode.UP_DOWN`` or ``SplitMode.LEFT_RIGHT``). @@ -60,18 +45,6 @@ class Split(Optimization): mode: SplitMode -@dataclass(frozen=True) -class AutoChunk(Optimization): - """Request compiler-driven outlining of chunked parallel loops. - - Lowers the enclosing ``pl.at`` scope to ``ScopeKind::AutoInCore`` so the - ``InterchangeChunkLoops`` pass can interchange chunked parallel loops - and outline the inner sequential portion into ``InCore`` scopes. - - Only valid with ``level=pl.Level.CORE_GROUP``. - """ - - def split(mode: SplitMode) -> Split: """Create a ``Split`` optimization entry. @@ -93,17 +66,8 @@ def split(mode: SplitMode) -> Split: return Split(mode=mode) -auto_chunk: AutoChunk = AutoChunk() -"""Sentinel for the ``AutoChunk`` optimization. - -Use as ``pl.auto_chunk`` in ``pl.at(..., optimizations=[pl.auto_chunk, ...])``. 
-""" - - __all__ = [ "Optimization", "Split", - "AutoChunk", "split", - "auto_chunk", ] diff --git a/python/pypto/language/parser/ast_parser.py b/python/pypto/language/parser/ast_parser.py index b67df36e3..8a4fa6ba2 100644 --- a/python/pypto/language/parser/ast_parser.py +++ b/python/pypto/language/parser/ast_parser.py @@ -184,14 +184,9 @@ class _AtKwargState: level: "ir.Level | None" = None role: "ir.Role | None" = None name_hint: str = "" - requests_auto_chunk: bool = False split_mode: "ir.SplitMode | None" = None - # Tracks which kwarg produced the AutoChunk / split state so the validation - # step can reject mixing the new `optimizations=` list with the deprecated - # `optimization=`/`split=` kwargs and emit DeprecationWarning at the end. + # Tracks whether optimizations= was already consumed so we can reject duplicates. new_optimizations_kw: "ast.keyword | None" = field(default=None) - legacy_optimization_kw: "ast.keyword | None" = field(default=None) - legacy_split_kw: "ast.keyword | None" = field(default=None) class ASTParser: @@ -1302,14 +1297,6 @@ def parse_for_loop(self, stmt: ast.For) -> None: # noqa: PLR0912 def _validate_chunk_args(self, chunk_expr: Any, init_values: list[Any], iter_call: ast.Call) -> None: """Validate chunk arguments for range/parallel/unroll loops.""" - if not self._is_inside_scope(ir.ScopeKind.AutoInCore): - raise ParserSyntaxError( - "chunk=... 
loops are only valid inside " - "with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk]):", - span=self.span_tracker.get_span(iter_call), - hint="Wrap the loop in 'with pl.at(level=pl.Level.CORE_GROUP, " - "optimizations=[pl.auto_chunk]):' or remove the chunk= argument.", - ) if not _is_const_int(chunk_expr): raise ParserSyntaxError( "chunk must be a compile-time constant positive integer", @@ -1927,21 +1914,15 @@ def parse_if_statement(self, stmt: ast.If) -> None: self.in_if_stmt = False self.current_if_builder = None - def _parse_at_kwargs( - self, call: ast.Call - ) -> tuple[ir.Level, ir.Role | None, bool, ir.SplitMode | None, str]: - """Extract level, role, AutoChunk request, split mode, and name from pl.at(...) call. + def _parse_at_kwargs(self, call: ast.Call) -> tuple[ir.Level, ir.Role | None, ir.SplitMode | None, str]: + """Extract level, role, split mode, and name from pl.at(...) call. - Supports both positional and keyword forms. Preferred new API uses the - ``optimizations=[...]`` list with ``pl.split(...)`` and ``pl.auto_chunk`` - entries. The legacy ``optimization=`` and top-level ``split=`` kwargs - are still accepted but emit a DeprecationWarning. Mixing the new - ``optimizations=`` list with either deprecated kwarg is a hard error. + Supports both positional and keyword forms. The optimizations=[...] list + can contain ``pl.split(MODE)`` entries to request a cross-core split + (valid only at ``Level.CORE_GROUP``). Returns: - Tuple of (level, role, requests_auto_chunk, split_mode, name_hint). - ``requests_auto_chunk`` is True when the resulting scope must be - ``AutoInCore`` rather than ``InCore``. + Tuple of (level, role, split_mode, name_hint). 
""" if len(call.args) > 2: raise ParserSyntaxError( @@ -1964,8 +1945,7 @@ def _parse_at_kwargs( hint="Use pl.at(pl.Level.HOST) or pl.at(level=pl.Level.HOST)", ) - self._validate_at_kwarg_combinations(state) - return state.level, state.role, state.requests_auto_chunk, state.split_mode, state.name_hint + return state.level, state.role, state.split_mode, state.name_hint def _dispatch_at_keyword(self, kw: ast.keyword, state: "_AtKwargState") -> None: """Dispatch a single pl.at() keyword argument and update state.""" @@ -1978,11 +1958,13 @@ def _dispatch_at_keyword(self, kw: ast.keyword, state: "_AtKwargState") -> None: raise ParserSyntaxError("pl.at() got multiple values for argument 'role'") state.role = extract_enum_value(kw.value, ROLE_MAP, "Role", "pl.Role") elif kw.arg == "optimizations": - self._handle_at_optimizations_kw(kw, state) - elif kw.arg == "optimization": - self._handle_at_legacy_optimization_kw(kw, state) - elif kw.arg == "split": - self._handle_at_legacy_split_kw(kw, state) + if state.new_optimizations_kw is not None: + raise ParserSyntaxError( + "pl.at() got multiple values for argument 'optimizations'", + span=self.span_tracker.get_span(kw), + ) + state.new_optimizations_kw = kw + state.split_mode = self._parse_optimizations_list(kw.value) elif kw.arg == "name_hint": state.name_hint = self._parse_scope_name_hint(kw.value, "pl.at()") elif kw.arg is None: @@ -1996,113 +1978,28 @@ def _dispatch_at_keyword(self, kw: ast.keyword, state: "_AtKwargState") -> None: hint="Supported arguments: level, role, optimizations, name_hint", ) - def _handle_at_optimizations_kw(self, kw: ast.keyword, state: "_AtKwargState") -> None: - if state.new_optimizations_kw is not None: - raise ParserSyntaxError( - "pl.at() got multiple values for argument 'optimizations'", - span=self.span_tracker.get_span(kw), - ) - state.new_optimizations_kw = kw - state.requests_auto_chunk, state.split_mode = self._parse_optimizations_list(kw.value) - - def 
_handle_at_legacy_optimization_kw(self, kw: ast.keyword, state: "_AtKwargState") -> None: - if state.legacy_optimization_kw is not None: - raise ParserSyntaxError( - "pl.at() got multiple values for argument 'optimization'", - span=self.span_tracker.get_span(kw), - ) - state.legacy_optimization_kw = kw - # Bare or called legacy optimizer always implies AutoChunk. - state.requests_auto_chunk = True - state.split_mode = self._parse_chunked_loop_optimizer(kw.value) - - def _handle_at_legacy_split_kw(self, kw: ast.keyword, state: "_AtKwargState") -> None: - if state.legacy_split_kw is not None: - raise ParserSyntaxError( - "pl.at() got multiple values for argument 'split'", - span=self.span_tracker.get_span(kw), - ) - state.legacy_split_kw = kw - state.split_mode = self._eval_split_mode(kw.value) - - def _validate_at_kwarg_combinations(self, state: "_AtKwargState") -> None: - """Reject illegal kwarg combinations and emit DeprecationWarnings.""" - # Hard error when mixing new optimizations= with deprecated kwargs. - if state.new_optimizations_kw is not None and ( - state.legacy_optimization_kw is not None or state.legacy_split_kw is not None - ): - offending = state.legacy_optimization_kw or state.legacy_split_kw - assert offending is not None - raise ParserSyntaxError( - "Cannot mix 'optimizations=' with deprecated 'optimization=' or 'split=' kwargs in pl.at()", - span=self.span_tracker.get_span(offending), - hint="Use only optimizations=[pl.split(...), pl.auto_chunk] — drop the deprecated kwargs.", - ) - - # Preserve the pre-existing rule that the two deprecated kwargs cannot be - # combined: legacy `optimization=` always implied AutoInCore + a baked-in - # split, so combining it with legacy top-level `split=` was ambiguous. 
- if state.legacy_optimization_kw is not None and state.legacy_split_kw is not None: - raise ParserSyntaxError( - "Cannot use both 'optimization' and 'split' in pl.at()", - span=self.span_tracker.get_span(state.legacy_split_kw), - hint="Use optimizations=[pl.auto_chunk, pl.split(...)] for AutoInCore + " - "split, or optimizations=[pl.split(...)] for plain InCore + split.", - ) - - # Emit deprecation warnings for legacy kwargs (after mixing checks, so the - # user sees the structural error first if both apply). - if state.legacy_optimization_kw is not None: - warnings.warn( - "pl.at(optimization=pl.chunked_loop_optimizer[(...)]) is deprecated; " - "use pl.at(optimizations=[pl.auto_chunk]) — combine with pl.split(...) " - "if a split mode is needed.", - DeprecationWarning, - stacklevel=2, - ) - if state.legacy_split_kw is not None: - warnings.warn( - "pl.at(split=...) is deprecated; use pl.at(optimizations=[pl.split(...)]).", - DeprecationWarning, - stacklevel=2, - ) - - def _parse_optimizations_list(self, value: ast.expr) -> tuple[bool, "ir.SplitMode | None"]: + def _parse_optimizations_list(self, value: ast.expr) -> "ir.SplitMode | None": """Parse pl.at(..., optimizations=[...]) AST node. - Each entry must be one of: - - - ``pl.auto_chunk`` — request AutoInCore semantics. - - ``pl.split(MODE)`` — set the cross-core split mode. - - Both fully qualified forms (``pl.optimizations.auto_chunk``, - ``pl.optimizations.split(MODE)``) are also accepted. + Each entry must be ``pl.split(MODE)`` — the fully qualified form + ``pl.optimizations.split(MODE)`` is also accepted. Returns: - Tuple ``(requests_auto_chunk, split_mode)``. + The requested split mode, or ``None`` if no ``pl.split(...)`` entry + was provided. """ if not isinstance(value, ast.List): raise ParserSyntaxError( "pl.at(optimizations=...) 
must be a list literal", span=self.span_tracker.get_span(value), - hint="Use optimizations=[pl.split(pl.SplitMode.UP_DOWN)] or optimizations=[pl.auto_chunk].", + hint="Use optimizations=[pl.split(pl.SplitMode.UP_DOWN)].", ) - requests_auto_chunk = False split_mode: ir.SplitMode | None = None - seen_auto_chunk = False seen_split = False for entry in value.elts: - if self._is_pl_auto_chunk(entry): - if seen_auto_chunk: - raise ParserSyntaxError( - "Duplicate 'pl.auto_chunk' in optimizations=[...]", - span=self.span_tracker.get_span(entry), - ) - seen_auto_chunk = True - requests_auto_chunk = True - elif (mode := self._try_parse_pl_split(entry)) is not None: + if (mode := self._try_parse_pl_split(entry)) is not None: if seen_split: raise ParserSyntaxError( "Duplicate 'pl.split(...)' in optimizations=[...]", @@ -2114,28 +2011,10 @@ def _parse_optimizations_list(self, value: ast.expr) -> tuple[bool, "ir.SplitMod raise ParserSyntaxError( "Unsupported entry in pl.at(optimizations=[...])", span=self.span_tracker.get_span(entry), - hint="Each entry must be pl.auto_chunk or pl.split(pl.SplitMode.X).", + hint="Each entry must be pl.split(pl.SplitMode.X).", ) - return requests_auto_chunk, split_mode - - @staticmethod - def _is_pl_auto_chunk(node: ast.expr) -> bool: - """Return True if the AST node is ``pl.auto_chunk`` or ``pl.optimizations.auto_chunk``.""" - if not isinstance(node, ast.Attribute) or node.attr != "auto_chunk": - return False - # pl.auto_chunk - if isinstance(node.value, ast.Name) and node.value.id == "pl": - return True - # pl.optimizations.auto_chunk - if ( - isinstance(node.value, ast.Attribute) - and node.value.attr == "optimizations" - and isinstance(node.value.value, ast.Name) - and node.value.value.id == "pl" - ): - return True - return False + return split_mode def _try_parse_pl_split(self, node: ast.expr) -> "ir.SplitMode | None": """Return the SplitMode if the AST node is ``pl.split(MODE)``; else None. 
@@ -2183,65 +2062,6 @@ def _try_parse_pl_split(self, node: ast.expr) -> "ir.SplitMode | None": ) return mode - def _parse_chunked_loop_optimizer(self, value: ast.expr) -> "ir.SplitMode": - """Parse pl.chunked_loop_optimizer or pl.chunked_loop_optimizer(split=...) AST node. - - Returns the split mode to use for the AutoInCore scope. - """ - # Bare: pl.chunked_loop_optimizer - if ( - isinstance(value, ast.Attribute) - and value.attr == "chunked_loop_optimizer" - and isinstance(value.value, ast.Name) - and value.value.id == "pl" - ): - return ir.SplitMode.UP_DOWN - - # Called: pl.chunked_loop_optimizer(split=pl.SplitMode.UP_DOWN) - if ( - isinstance(value, ast.Call) - and isinstance(value.func, ast.Attribute) - and value.func.attr == "chunked_loop_optimizer" - and isinstance(value.func.value, ast.Name) - and value.func.value.id == "pl" - ): - if value.args: - raise ParserSyntaxError( - "pl.chunked_loop_optimizer() does not accept positional arguments", - span=self.span_tracker.get_span(value), - hint="Use: pl.chunked_loop_optimizer(split=pl.SplitMode.UP_DOWN)", - ) - split = ir.SplitMode.UP_DOWN - for opt_kw in value.keywords: - if opt_kw.arg == "split": - split = extract_enum_value(opt_kw.value, SPLIT_MODE_MAP, "SplitMode", "pl.SplitMode") - if split == ir.SplitMode.NONE: - raise ParserSyntaxError( - "pl.chunked_loop_optimizer() does not support split=pl.SplitMode.NONE", - span=self.span_tracker.get_span(opt_kw.value), - hint="Use pl.SplitMode.UP_DOWN or pl.SplitMode.LEFT_RIGHT", - ) - else: - raise ParserSyntaxError( - f"pl.chunked_loop_optimizer() got unexpected keyword '{opt_kw.arg}'", - span=self.span_tracker.get_span(opt_kw), - hint="Only 'split' is supported: " - "pl.chunked_loop_optimizer(split=pl.SplitMode.UP_DOWN)", - ) - return split - - raise ParserSyntaxError( - "optimization= only accepts pl.chunked_loop_optimizer or " - "pl.chunked_loop_optimizer(split=pl.SplitMode.UP_DOWN)", - span=self.span_tracker.get_span(value), - hint="Use 
optimization=pl.chunked_loop_optimizer or " - "optimization=pl.chunked_loop_optimizer(split=pl.SplitMode.UP_DOWN)", - ) - - def _eval_split_mode(self, value: ast.expr) -> "ir.SplitMode": - """Extract SplitMode enum value from AST expression.""" - return extract_enum_value(value, SPLIT_MODE_MAP, "SplitMode", "pl.SplitMode") - def _parse_scope_name_hint(self, value: ast.expr, func_name: str) -> str: """Extract and validate a scope name hint from an AST expression. @@ -2274,43 +2094,9 @@ def _parse_legacy_scope( func_attr: str, scope_kind_map: dict[str, "ir.ScopeKind"], ) -> None: - """Parse legacy scope context managers (pl.incore, pl.auto_incore, pl.cluster).""" - split_mode = None + """Parse legacy scope context managers (pl.cluster, pl.spmd).""" name_hint = "" - if func_attr in ("auto_incore", "incore"): - if context_expr.args: - raise ParserSyntaxError( - f"pl.{func_attr}() does not accept positional arguments", - span=self.span_tracker.get_span(stmt), - hint=f"Use 'with pl.{func_attr}(split=pl.SplitMode.UP_DOWN):'", - ) - for kw in context_expr.keywords: - if kw.arg == "split": - split_mode = self._eval_split_mode(kw.value) - elif kw.arg == "name_hint": - name_hint = self._parse_scope_name_hint(kw.value, f"pl.{func_attr}()") - else: - raise ParserSyntaxError( - f"pl.{func_attr}() got unexpected keyword argument '{kw.arg}'", - span=self.span_tracker.get_span(stmt), - hint="Supported keywords: 'split', 'name_hint'", - ) - if func_attr == "incore": - warnings.warn( - "pl.incore() is deprecated; use 'with pl.at(level=pl.Level.CORE_GROUP):' " - "(optionally with optimizations=[pl.split(pl.SplitMode.X)]) instead", - DeprecationWarning, - stacklevel=2, - ) - else: - warnings.warn( - "pl.auto_incore() is deprecated; use " - "'with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk]):' " - "(combine with pl.split(pl.SplitMode.X) if a split mode is needed) instead", - DeprecationWarning, - stacklevel=2, - ) - elif func_attr == "cluster": + if func_attr == 
"cluster": if context_expr.args: raise ParserSyntaxError( f"pl.{func_attr}() does not accept positional arguments", @@ -2330,18 +2116,14 @@ def _parse_legacy_scope( span = self.span_tracker.get_span(stmt) self._parse_scope_body(stmt, scope_kind, span, name_hint=name_hint) return - elif func_attr == "spmd": + if func_attr == "spmd": self._parse_spmd_scope(stmt, context_expr, scope_kind_map) return - elif context_expr.args or context_expr.keywords: - raise ParserSyntaxError( - f"pl.{func_attr}() does not accept arguments", - span=self.span_tracker.get_span(stmt), - hint=f"Use 'with pl.{func_attr}():' without arguments", - ) - scope_kind = scope_kind_map[func_attr] - span = self.span_tracker.get_span(stmt) - self._parse_scope_body(stmt, scope_kind, span, split=split_mode, name_hint=name_hint) + raise ParserSyntaxError( + f"Unsupported scope context manager 'pl.{func_attr}()'", + span=self.span_tracker.get_span(stmt), + hint="Supported: pl.cluster(), pl.spmd(...), pl.at(level=...)", + ) def _parse_spmd_scope( self, @@ -2465,26 +2247,15 @@ def _parse_scope_body( self.scope_manager.exit_scope(leak_vars=True) def _parse_at_scope(self, stmt: ast.With, context_expr: ast.Call) -> None: - """Parse pl.at(...) context manager into a ScopeStmt.""" - level, role, requests_auto_chunk, split_mode, name_hint = self._parse_at_kwargs(context_expr) + """Parse pl.at(...) 
context manager into a HierarchyScopeStmt.""" + level, role, split_mode, name_hint = self._parse_at_kwargs(context_expr) span = self.span_tracker.get_span(stmt) is_core_group = level == ir.Level.CORE_GROUP - if requests_auto_chunk and not is_core_group: - raise ParserSyntaxError( - "auto-chunk optimization is only supported with level=pl.Level.CORE_GROUP " - "(via optimizations=[pl.auto_chunk] or the deprecated " - "optimization=pl.chunked_loop_optimizer)", - span=span, - hint="Use pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk]) " - "for an AutoInCore scope.", - ) - if split_mode is not None and not is_core_group: raise ParserSyntaxError( - "split mode is only supported with level=pl.Level.CORE_GROUP " - "(via optimizations=[pl.split(...)] or the deprecated split= kwarg)", + "split mode is only supported with level=pl.Level.CORE_GROUP", span=span, hint="Use pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.split(pl.SplitMode.UP_DOWN)]).", ) @@ -2493,33 +2264,29 @@ def _parse_at_scope(self, stmt: ast.With, context_expr: ast.Call) -> None: raise ParserSyntaxError( "role= is not supported with level=pl.Level.CORE_GROUP", span=span, - hint="Drop role= for InCore/AutoInCore scopes, " - "or use a non-CORE_GROUP level for Hierarchy scope", + hint="Drop role= for Level.CORE_GROUP scopes, " + "or use a non-CORE_GROUP level for a Hierarchy scope with a role", ) - if not is_core_group: - self._parse_scope_body( - stmt, ir.ScopeKind.Hierarchy, span, level=level, role=role, name_hint=name_hint - ) - elif requests_auto_chunk: - self._parse_scope_body(stmt, ir.ScopeKind.AutoInCore, span, split=split_mode, name_hint=name_hint) - else: - self._parse_scope_body(stmt, ir.ScopeKind.InCore, span, split=split_mode, name_hint=name_hint) + self._parse_scope_body( + stmt, + ir.ScopeKind.Hierarchy, + span, + level=level, + role=role, + split=split_mode, + name_hint=name_hint, + ) def parse_with_statement(self, stmt: ast.With) -> None: """Parse with statement for scope 
contexts. Currently supports: - - with pl.incore(): ... (deprecated; creates ScopeStmt with InCore scope) - - with pl.incore(split=pl.SplitMode.UP_DOWN): ... (deprecated; InCore with split) - - with pl.auto_incore(): ... (deprecated; creates ScopeStmt with AutoInCore scope) - - with pl.auto_incore(split=pl.SplitMode.UP_DOWN): ... (deprecated; with split mode) - - with pl.cluster(): ... (creates ScopeStmt with Cluster scope) - - with pl.at(level=..., role=...): ... (creates ScopeStmt with InCore/Hierarchy scope) - - with pl.at(level=CORE_GROUP): ... (creates ScopeStmt with InCore scope) - - with pl.at(level=CORE_GROUP, split=pl.SplitMode.UP_DOWN): ... (InCore with split) - - with pl.at(level=CORE_GROUP, optimization=pl.chunked_loop_optimizer): ... - (creates ScopeStmt with AutoInCore scope) + - with pl.cluster(): ... (creates ClusterScopeStmt) + - with pl.spmd(core_num=N): ... (creates SpmdScopeStmt) + - with pl.at(level=...): ... (creates HierarchyScopeStmt) + - with pl.at(level=CORE_GROUP, optimizations=[pl.split(...)]): ... 
+ (creates HierarchyScopeStmt at CORE_GROUP with split mode) Args: stmt: With AST node @@ -2529,8 +2296,8 @@ def parse_with_statement(self, stmt: ast.With) -> None: raise ParserSyntaxError( "Only single context manager supported in with statement", span=self.span_tracker.get_span(stmt), - hint="Use 'with pl.incore():', 'with pl.auto_incore():'," - " 'with pl.cluster():', or 'with pl.at(level=...):'" + hint="Use 'with pl.cluster():', 'with pl.spmd(core_num=N):'," + " or 'with pl.at(level=...):'" " without multiple context managers", ) @@ -2539,8 +2306,6 @@ def parse_with_statement(self, stmt: ast.With) -> None: # Map DSL function names to ScopeKind values _SCOPE_KIND_MAP = { - "incore": ir.ScopeKind.InCore, - "auto_incore": ir.ScopeKind.AutoInCore, "cluster": ir.ScopeKind.Cluster, "spmd": ir.ScopeKind.Spmd, } @@ -2548,12 +2313,10 @@ def parse_with_statement(self, stmt: ast.With) -> None: if isinstance(context_expr, ast.Call): func = context_expr.func if isinstance(func, ast.Attribute) and isinstance(func.value, ast.Name) and func.value.id == "pl": - # Existing scope kinds: pl.incore(), pl.auto_incore(), pl.cluster() if func.attr in _SCOPE_KIND_MAP: self._parse_legacy_scope(stmt, context_expr, func.attr, _SCOPE_KIND_MAP) return - # pl.at(level=..., role=..., optimization=...) 
if func.attr == "at": self._parse_at_scope(stmt, context_expr) return @@ -2562,8 +2325,8 @@ def parse_with_statement(self, stmt: ast.With) -> None: raise UnsupportedFeatureError( "Unsupported context manager in with statement", span=self.span_tracker.get_span(stmt), - hint="Supported: 'with pl.incore():', 'with pl.auto_incore():'," - " 'with pl.cluster():', 'with pl.at(level=..., optimization=...):'", + hint="Supported: 'with pl.cluster():', 'with pl.spmd(core_num=N):'," + " 'with pl.at(level=..., optimizations=[...]):'", ) def parse_return(self, stmt: ast.Return) -> None: diff --git a/python/pypto/pypto_core/ir.pyi b/python/pypto/pypto_core/ir.pyi index a6c70b632..d75bd6e51 100644 --- a/python/pypto/pypto_core/ir.pyi +++ b/python/pypto/pypto_core/ir.pyi @@ -1771,19 +1771,13 @@ class WhileStmt(Stmt): class ScopeKind(enum.Enum): """Scope kind classification.""" - InCore = 0 - """InCore scope for AICore sub-graphs.""" - - AutoInCore = 1 - """AutoInCore scope for automatic chunking.""" - - Cluster = 2 + Cluster = 0 """Cluster scope for co-scheduled AIC + AIV groups.""" - Hierarchy = 3 - """Distributed hierarchy scope (uses level/role on ScopeStmt).""" + Hierarchy = 1 + """Distributed hierarchy scope (uses level/role/split on ScopeStmt).""" - Spmd = 4 + Spmd = 2 """SPMD dispatch scope (core_num/sync_start on ScopeStmt).""" class SplitMode(enum.Enum): @@ -1814,40 +1808,8 @@ class ScopeStmt(Stmt): """The nested statements.""" def __init__(self, *args: object, **kwargs: object) -> None: - """ScopeStmt is abstract — construct an InCoreScopeStmt, AutoInCoreScopeStmt, - ClusterScopeStmt, HierarchyScopeStmt, or SpmdScopeStmt instead.""" - -class InCoreScopeStmt(ScopeStmt): - """InCore scope: AICore sub-graph region.""" - - split: Final[SplitMode | None] - """Split mode for cross-core transfer (None or SplitMode.None for no split).""" - - def __init__( - self, - split: SplitMode | None = None, - name_hint: str = "", - *, - body: Stmt, - span: Span, - ) -> None: - """Create an 
InCore scope statement.""" - -class AutoInCoreScopeStmt(ScopeStmt): - """AutoInCore scope: InCore region with automatic chunking.""" - - split: Final[SplitMode | None] - """Split mode for cross-core transfer (None or SplitMode.None for no split).""" - - def __init__( - self, - split: SplitMode | None = None, - name_hint: str = "", - *, - body: Stmt, - span: Span, - ) -> None: - """Create an AutoInCore scope statement.""" + """ScopeStmt is abstract — construct a ClusterScopeStmt, HierarchyScopeStmt, + or SpmdScopeStmt instead.""" class ClusterScopeStmt(ScopeStmt): """Cluster scope: co-scheduled AIC + AIV group.""" @@ -1864,10 +1826,14 @@ class HierarchyScopeStmt(ScopeStmt): role: Final[Role | None] """Function role (Orchestrator or Worker; None for unspecified).""" + split: Final[SplitMode | None] + """AIC/AIV split mode (only valid at Level.CORE_GROUP).""" + def __init__( self, level: Level, role: Role | None = None, + split: SplitMode | None = None, name_hint: str = "", *, body: Stmt, @@ -2661,7 +2627,7 @@ class IRBuilder: """Begin building a scope statement. Args: - scope_kind: The kind of scope (e.g., ScopeKind.InCore) + scope_kind: The kind of scope (e.g., ScopeKind.Hierarchy) span: Source location for scope statement level: Hierarchy level (default: None) role: Hierarchy scope role (default: None) @@ -3191,8 +3157,6 @@ class IRVisitor: def visit_if_stmt(self, op: IfStmt) -> None: ... def visit_for_stmt(self, op: ForStmt) -> None: ... def visit_while_stmt(self, op: WhileStmt) -> None: ... - def visit_in_core_scope_stmt(self, op: InCoreScopeStmt) -> None: ... - def visit_auto_in_core_scope_stmt(self, op: AutoInCoreScopeStmt) -> None: ... def visit_cluster_scope_stmt(self, op: ClusterScopeStmt) -> None: ... def visit_hierarchy_scope_stmt(self, op: HierarchyScopeStmt) -> None: ... def visit_spmd_scope_stmt(self, op: SpmdScopeStmt) -> None: ... @@ -3268,8 +3232,6 @@ class IRMutator: def visit_if_stmt(self, op: IfStmt) -> Stmt: ... 
def visit_for_stmt(self, op: ForStmt) -> Stmt: ... def visit_while_stmt(self, op: WhileStmt) -> Stmt: ... - def visit_in_core_scope_stmt(self, op: InCoreScopeStmt) -> Stmt: ... - def visit_auto_in_core_scope_stmt(self, op: AutoInCoreScopeStmt) -> Stmt: ... def visit_cluster_scope_stmt(self, op: ClusterScopeStmt) -> Stmt: ... def visit_hierarchy_scope_stmt(self, op: HierarchyScopeStmt) -> Stmt: ... def visit_spmd_scope_stmt(self, op: SpmdScopeStmt) -> Stmt: ... diff --git a/python/pypto/pypto_core/passes.pyi b/python/pypto/pypto_core/passes.pyi index 2f5e6e93c..69500a5f9 100644 --- a/python/pypto/pypto_core/passes.pyi +++ b/python/pypto/pypto_core/passes.pyi @@ -23,7 +23,6 @@ class IRProperty(Enum): NoNestedCalls = ... NormalizedStmtStructure = ... NoRedundantBlocks = ... - SplitIncoreOrch = ... HasMemRefs = ... IncoreTileOps = ... AllocatedMemoryAddr = ... @@ -37,7 +36,6 @@ class IRProperty(Enum): StructuredCtrlFlow = ... VectorKernelSplit = ... OutParamNotShadowed = ... - NoNestedInCore = ... class IRPropertySet: """A set of IR properties backed by a bitset.""" @@ -316,12 +314,6 @@ class TypeCheckErrorType(Enum): FOR_RANGE_MUST_BE_SCALAR = ... CONDITION_MUST_BE_BOOL = ... 
-def split_chunked_loops() -> Pass: - """Create a pass that splits chunked loops into nested loops.""" - -def interchange_chunk_loops() -> Pass: - """Create a pass that interchanges chunk loops and inserts InCore scopes.""" - def unroll_loops() -> Pass: """Create a loop unrolling pass that expands ForKind.Unroll loops at compile time.""" @@ -348,14 +340,14 @@ def ctrl_flow_transform() -> Pass: def convert_to_ssa() -> Pass: """Create an SSA conversion pass.""" -def outline_incore_scopes() -> Pass: - """Create a pass that outlines InCore scopes.""" - def outline_cluster_scopes() -> Pass: """Create a pass that outlines Cluster scopes to Group and standalone Spmd scopes to Spmd.""" def outline_hierarchy_scopes() -> Pass: - """Create a pass that outlines Hierarchy scopes into level/role functions.""" + """Outline non-CORE_GROUP Hierarchy scopes into Opaque level/role functions.""" + +def outline_incore_scopes() -> Pass: + """Outline CORE_GROUP Hierarchy scopes into InCore functions; promote parent to Orchestration.""" def convert_tensor_to_tile_ops() -> Pass: """Create a pass that converts tensor ops to tile ops in InCore functions.""" @@ -492,14 +484,12 @@ __all__ = [ "VerificationError", "SSAErrorType", "TypeCheckErrorType", - "split_chunked_loops", - "interchange_chunk_loops", "unroll_loops", "ctrl_flow_transform", "convert_to_ssa", - "outline_incore_scopes", "outline_cluster_scopes", "outline_hierarchy_scopes", + "outline_incore_scopes", "convert_tensor_to_tile_ops", "optimize_orch_tensors", "flatten_tile_nd_to_2d", diff --git a/src/ir/builder.cpp b/src/ir/builder.cpp index da80e97d3..cc6a26e32 100644 --- a/src/ir/builder.cpp +++ b/src/ir/builder.cpp @@ -329,20 +329,13 @@ StmtPtr IRBuilder::EndScope(const Span& end_span) { // Dispatch on scope_kind to the matching derived class (issue #1047). 
ScopeStmtPtr scope_stmt; switch (scope_kind) { - case ScopeKind::InCore: - scope_stmt = std::make_shared(split, std::move(name_hint), body, combined_span); - break; - case ScopeKind::AutoInCore: - scope_stmt = - std::make_shared(split, std::move(name_hint), body, combined_span); - break; case ScopeKind::Cluster: scope_stmt = std::make_shared(std::move(name_hint), body, combined_span); break; case ScopeKind::Hierarchy: CHECK(level.has_value()) << "Hierarchy scope requires a level"; - scope_stmt = - std::make_shared(*level, role, std::move(name_hint), body, combined_span); + scope_stmt = std::make_shared(*level, role, split, std::move(name_hint), body, + combined_span); break; case ScopeKind::Spmd: CHECK(core_num.has_value()) << "Spmd scope requires core_num"; diff --git a/src/ir/serialization/serializer.cpp b/src/ir/serialization/serializer.cpp index 0584c40e9..62cfdc27e 100644 --- a/src/ir/serialization/serializer.cpp +++ b/src/ir/serialization/serializer.cpp @@ -219,8 +219,6 @@ class IRSerializer::Impl { SERIALIZE_FIELDS(ReturnStmt); SERIALIZE_FIELDS(ForStmt); SERIALIZE_FIELDS(WhileStmt); - SERIALIZE_FIELDS(InCoreScopeStmt); - SERIALIZE_FIELDS(AutoInCoreScopeStmt); SERIALIZE_FIELDS(ClusterScopeStmt); SERIALIZE_FIELDS(HierarchyScopeStmt); SERIALIZE_FIELDS(SpmdScopeStmt); diff --git a/src/ir/serialization/type_deserializers.cpp b/src/ir/serialization/type_deserializers.cpp index 00435d888..51f2c5233 100644 --- a/src/ir/serialization/type_deserializers.cpp +++ b/src/ir/serialization/type_deserializers.cpp @@ -592,28 +592,6 @@ static std::optional DeserializeScopeSplit(const msgpack::object& fie return split; } -// Deserialize InCoreScopeStmt -static IRNodePtr DeserializeInCoreScopeStmt(const msgpack::object& fields_obj, msgpack::zone& zone, - DeserializerContext& ctx) { - auto span = ctx.DeserializeSpan(GET_FIELD_OBJ("span")); - auto split = DeserializeScopeSplit(fields_obj, ctx); - auto name_hint = DeserializeScopeNameHint(fields_obj, ctx); - auto body = 
std::static_pointer_cast(ctx.DeserializeNode(GET_FIELD_OBJ("body"), zone)); - return std::make_shared(split, std::move(name_hint), body, span, - DeserializeLeadingComments(fields_obj)); -} - -// Deserialize AutoInCoreScopeStmt -static IRNodePtr DeserializeAutoInCoreScopeStmt(const msgpack::object& fields_obj, msgpack::zone& zone, - DeserializerContext& ctx) { - auto span = ctx.DeserializeSpan(GET_FIELD_OBJ("span")); - auto split = DeserializeScopeSplit(fields_obj, ctx); - auto name_hint = DeserializeScopeNameHint(fields_obj, ctx); - auto body = std::static_pointer_cast(ctx.DeserializeNode(GET_FIELD_OBJ("body"), zone)); - return std::make_shared(split, std::move(name_hint), body, span, - DeserializeLeadingComments(fields_obj)); -} - // Deserialize ClusterScopeStmt static IRNodePtr DeserializeClusterScopeStmt(const msgpack::object& fields_obj, msgpack::zone& zone, DeserializerContext& ctx) { @@ -641,9 +619,10 @@ static IRNodePtr DeserializeHierarchyScopeStmt(const msgpack::object& fields_obj role = static_cast(role_obj->via.u64); } + auto split = DeserializeScopeSplit(fields_obj, ctx); auto name_hint = DeserializeScopeNameHint(fields_obj, ctx); auto body = std::static_pointer_cast(ctx.DeserializeNode(GET_FIELD_OBJ("body"), zone)); - return std::make_shared(level, role, std::move(name_hint), body, span, + return std::make_shared(level, role, split, std::move(name_hint), body, span, DeserializeLeadingComments(fields_obj)); } @@ -689,10 +668,6 @@ static IRNodePtr DeserializeLegacyScopeStmt(const msgpack::object& fields_obj, m << "Legacy ScopeStmt scope_kind must be a string, got msgpack type " << static_cast(kind_obj.type); auto kind = StringToScopeKind(kind_obj.as()); switch (kind) { - case ScopeKind::InCore: - return DeserializeInCoreScopeStmt(fields_obj, zone, ctx); - case ScopeKind::AutoInCore: - return DeserializeAutoInCoreScopeStmt(fields_obj, zone, ctx); case ScopeKind::Cluster: return DeserializeClusterScopeStmt(fields_obj, zone, ctx); case ScopeKind::Hierarchy: 
@@ -937,9 +912,6 @@ static TypeRegistrar _yield_stmt_registrar("YieldStmt", DeserializeYieldStmt); static TypeRegistrar _return_stmt_registrar("ReturnStmt", DeserializeReturnStmt); static TypeRegistrar _for_stmt_registrar("ForStmt", DeserializeForStmt); static TypeRegistrar _while_stmt_registrar("WhileStmt", DeserializeWhileStmt); -static TypeRegistrar _in_core_scope_stmt_registrar("InCoreScopeStmt", DeserializeInCoreScopeStmt); -static TypeRegistrar _auto_in_core_scope_stmt_registrar("AutoInCoreScopeStmt", - DeserializeAutoInCoreScopeStmt); static TypeRegistrar _cluster_scope_stmt_registrar("ClusterScopeStmt", DeserializeClusterScopeStmt); static TypeRegistrar _hierarchy_scope_stmt_registrar("HierarchyScopeStmt", DeserializeHierarchyScopeStmt); static TypeRegistrar _spmd_scope_stmt_registrar("SpmdScopeStmt", DeserializeSpmdScopeStmt); diff --git a/src/ir/stmt.cpp b/src/ir/stmt.cpp index 31bac4898..55860d671 100644 --- a/src/ir/stmt.cpp +++ b/src/ir/stmt.cpp @@ -11,6 +11,25 @@ #include "pypto/ir/stmt.h" +#include +#include +#include + +#include "pypto/ir/function.h" + namespace pypto { -namespace ir {} // namespace ir +namespace ir { + +HierarchyScopeStmt::HierarchyScopeStmt(Level level, std::optional role, std::optional split, + std::string name_hint, StmtPtr body, Span span, + std::vector leading_comments) + : ScopeStmt(std::move(name_hint), std::move(body), std::move(span), std::move(leading_comments)), + level_(level), + role_(role), + split_(split) { + CHECK(!split_.has_value() || level_ == Level::CORE_GROUP) + << "HierarchyScopeStmt split is only valid at Level::CORE_GROUP"; +} + +} // namespace ir } // namespace pypto diff --git a/src/ir/transforms/convert_to_ssa_pass.cpp b/src/ir/transforms/convert_to_ssa_pass.cpp index 2bd1f7ff1..920691655 100644 --- a/src/ir/transforms/convert_to_ssa_pass.cpp +++ b/src/ir/transforms/convert_to_ssa_pass.cpp @@ -375,8 +375,7 @@ class SSAConverter { if (kind == ObjectKind::ReturnStmt) return ConvertReturn(As(s)); if (kind == 
ObjectKind::YieldStmt) return ConvertYield(As(s)); if (kind == ObjectKind::EvalStmt) return ConvertEval(As(s)); - if (kind == ObjectKind::InCoreScopeStmt || kind == ObjectKind::AutoInCoreScopeStmt || - kind == ObjectKind::ClusterScopeStmt || kind == ObjectKind::HierarchyScopeStmt || + if (kind == ObjectKind::ClusterScopeStmt || kind == ObjectKind::HierarchyScopeStmt || kind == ObjectKind::SpmdScopeStmt) { return ConvertScope(As(s)); } @@ -878,8 +877,6 @@ class SSAConverter { result->body_ = body; return result; }; - if (auto in_core = As(op)) return rewrite(in_core); - if (auto auto_in_core = As(op)) return rewrite(auto_in_core); if (auto cluster = As(op)) return rewrite(cluster); if (auto hier = As(op)) return rewrite(hier); if (auto spmd = As(op)) return rewrite(spmd); diff --git a/src/ir/transforms/flatten_call_expr_pass.cpp b/src/ir/transforms/flatten_call_expr_pass.cpp index e65f3cc6d..8e9eba128 100644 --- a/src/ir/transforms/flatten_call_expr_pass.cpp +++ b/src/ir/transforms/flatten_call_expr_pass.cpp @@ -57,8 +57,6 @@ class FlattenCallExprMutator : public IRMutator { StmtPtr VisitStmt_(const IfStmtPtr& op) override; StmtPtr VisitStmt_(const ForStmtPtr& op) override; StmtPtr VisitStmt_(const WhileStmtPtr& op) override; - StmtPtr VisitStmt_(const InCoreScopeStmtPtr& op) override; - StmtPtr VisitStmt_(const AutoInCoreScopeStmtPtr& op) override; StmtPtr VisitStmt_(const ClusterScopeStmtPtr& op) override; StmtPtr VisitStmt_(const HierarchyScopeStmtPtr& op) override; StmtPtr VisitStmt_(const SpmdScopeStmtPtr& op) override; @@ -317,19 +315,6 @@ StmtPtr FlattenScopeBody(FlattenCallExprMutator* self, std::vector& pen } } // namespace -StmtPtr FlattenCallExprMutator::VisitStmt_(const InCoreScopeStmtPtr& op) { - auto new_body = FlattenScopeBody(this, pending_stmts_, op->body_); - if (new_body.get() == op->body_.get()) return op; - return std::make_shared(op->split_, op->name_hint_, std::move(new_body), op->span_); -} - -StmtPtr FlattenCallExprMutator::VisitStmt_(const 
AutoInCoreScopeStmtPtr& op) { - auto new_body = FlattenScopeBody(this, pending_stmts_, op->body_); - if (new_body.get() == op->body_.get()) return op; - return std::make_shared(op->split_, op->name_hint_, std::move(new_body), - op->span_); -} - StmtPtr FlattenCallExprMutator::VisitStmt_(const ClusterScopeStmtPtr& op) { auto new_body = FlattenScopeBody(this, pending_stmts_, op->body_); if (new_body.get() == op->body_.get()) return op; @@ -339,7 +324,7 @@ StmtPtr FlattenCallExprMutator::VisitStmt_(const ClusterScopeStmtPtr& op) { StmtPtr FlattenCallExprMutator::VisitStmt_(const HierarchyScopeStmtPtr& op) { auto new_body = FlattenScopeBody(this, pending_stmts_, op->body_); if (new_body.get() == op->body_.get()) return op; - return std::make_shared(op->level_, op->role_, op->name_hint_, + return std::make_shared(op->level_, op->role_, op->split_, op->name_hint_, std::move(new_body), op->span_); } diff --git a/src/ir/transforms/flatten_tile_nd_to_2d_pass.cpp b/src/ir/transforms/flatten_tile_nd_to_2d_pass.cpp index 35862a990..6c0552309 100644 --- a/src/ir/transforms/flatten_tile_nd_to_2d_pass.cpp +++ b/src/ir/transforms/flatten_tile_nd_to_2d_pass.cpp @@ -279,11 +279,7 @@ std::vector TransformBody(const std::vector& stmts, FlattenCon new_scope->body_ = new_body; return new_scope; }; - if (auto in_core = As(stmt)) { - result.push_back(rewrite(in_core)); - } else if (auto auto_in_core = As(stmt)) { - result.push_back(rewrite(auto_in_core)); - } else if (auto cluster = As(stmt)) { + if (auto cluster = As(stmt)) { result.push_back(rewrite(cluster)); } else if (auto hier = As(stmt)) { result.push_back(rewrite(hier)); diff --git a/src/ir/transforms/fuse_create_assemble_to_slice_pass.cpp b/src/ir/transforms/fuse_create_assemble_to_slice_pass.cpp index d0b8b7a45..d4e511c8d 100644 --- a/src/ir/transforms/fuse_create_assemble_to_slice_pass.cpp +++ b/src/ir/transforms/fuse_create_assemble_to_slice_pass.cpp @@ -515,7 +515,7 @@ ProgramPtr TransformFuseCreateAssembleToSlice(const 
ProgramPtr& program) { return std::make_shared(std::move(new_functions), program->name_, program->span_); } -inline const PassProperties kFuseCreateAssembleToSliceProperties{.required = {IRProperty::SplitIncoreOrch}}; +inline const PassProperties kFuseCreateAssembleToSliceProperties{.required = {IRProperty::HierarchyOutlined}}; } // namespace diff --git a/src/ir/transforms/interchange_chunk_loops_pass.cpp b/src/ir/transforms/interchange_chunk_loops_pass.cpp deleted file mode 100644 index a84f269fb..000000000 --- a/src/ir/transforms/interchange_chunk_loops_pass.cpp +++ /dev/null @@ -1,887 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- * ----------------------------------------------------------------------------------------------------------- - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "pypto/core/error.h" -#include "pypto/core/logging.h" -#include "pypto/ir/core.h" -#include "pypto/ir/expr.h" -#include "pypto/ir/function.h" -#include "pypto/ir/program.h" -#include "pypto/ir/span.h" -#include "pypto/ir/stmt.h" -#include "pypto/ir/transforms/base/mutator.h" -#include "pypto/ir/transforms/base/visitor.h" -#include "pypto/ir/transforms/pass_properties.h" -#include "pypto/ir/transforms/passes.h" -#include "pypto/ir/transforms/utils/auto_name_utils.h" -#include "pypto/ir/transforms/utils/mutable_copy.h" -#include "pypto/ir/transforms/utils/transform_utils.h" -#include "pypto/ir/verifier/verifier.h" - -namespace pypto { -namespace ir { - -using Attrs = std::vector>; - -namespace { - -/// Build attrs for a generated loop: copy original attrs (excluding loop_origin) and set the new origin. -Attrs MakeLoopAttrs(const Attrs& original_attrs, LoopOrigin origin) { - Attrs result; - for (const auto& [key, value] : original_attrs) { - if (key != "loop_origin") result.emplace_back(key, value); - } - result.emplace_back("loop_origin", origin); - return result; -} - -/** - * @brief A single entry in a chunk-loop chain. - */ -struct ChainEntry { - ForStmtPtr for_stmt; - LoopOrigin origin; -}; - -/** - * @brief Check if a statement body contains a ScopeStmt(InCore). 
- */ -static bool ContainsInCoreScope(const StmtPtr& stmt) { - if (!stmt) return false; - - auto kind = stmt->GetKind(); - switch (kind) { - case ObjectKind::InCoreScopeStmt: - return true; - case ObjectKind::AutoInCoreScopeStmt: - case ObjectKind::ClusterScopeStmt: - case ObjectKind::HierarchyScopeStmt: - case ObjectKind::SpmdScopeStmt: { - auto scope = std::static_pointer_cast(stmt); - return ContainsInCoreScope(scope->body_); - } - case ObjectKind::SeqStmts: { - auto seq = std::static_pointer_cast(stmt); - for (const auto& s : seq->stmts_) { - if (ContainsInCoreScope(s)) return true; - } - return false; - } - case ObjectKind::ForStmt: { - auto for_stmt = std::static_pointer_cast(stmt); - return ContainsInCoreScope(for_stmt->body_); - } - default: - return false; - } -} - -static bool IsComputeTensorOp(const std::string& op_name) { - return transform_utils::IsComputeTensorOp(op_name); -} - -class ComputeTensorOpDetector : public IRVisitor { - public: - [[nodiscard]] bool Found() const { return found_; } - - void VisitExpr_(const CallPtr& op) override { - if (!op || found_) return; - if (op->op_ && IsComputeTensorOp(op->op_->name_)) { - found_ = true; - return; - } - IRVisitor::VisitExpr_(op); - } - - private: - bool found_ = false; -}; - -static bool ContainsComputeTensorOp(const StmtPtr& stmt) { - if (!stmt) return false; - ComputeTensorOpDetector detector; - detector.VisitStmt(stmt); - return detector.Found(); -} - -/// Detects whether an expression tree contains any sub-expression with TensorType or TileType. 
-class TensorOrTileTypedExprDetector : public IRVisitor { - public: - [[nodiscard]] bool Found() const { return found_; } - - void VisitExpr(const ExprPtr& expr) override { - if (!expr || found_) return; - auto type = expr->GetType(); - if (type) { - auto kind = type->GetKind(); - if (kind == ObjectKind::TensorType || kind == ObjectKind::TileType) { - found_ = true; - return; - } - } - IRVisitor::VisitExpr(expr); - } - - private: - bool found_ = false; -}; - -/// Returns true if stmt is an AssignStmt with a scalar-typed target variable -/// and a value expression that involves no tensor/tile data. -static bool IsPureScalarAssignment(const StmtPtr& stmt) { - if (!stmt) return false; - - auto kind = stmt->GetKind(); - if (kind == ObjectKind::AssignStmt) { - auto assign = std::static_pointer_cast(stmt); - auto var_type = assign->var_->GetType(); - if (!var_type || var_type->GetKind() != ObjectKind::ScalarType) return false; - TensorOrTileTypedExprDetector detector; - detector.VisitExpr(assign->value_); - return !detector.Found(); - } - - return false; -} - -static bool ContainsChunkLoop(const StmtPtr& stmt) { - if (!stmt) return false; - - auto kind = stmt->GetKind(); - switch (kind) { - case ObjectKind::ForStmt: { - auto for_stmt = std::static_pointer_cast(stmt); - return for_stmt->GetAttr("loop_origin") != LoopOrigin::Original || - ContainsChunkLoop(for_stmt->body_); - } - case ObjectKind::SeqStmts: { - auto seq = std::static_pointer_cast(stmt); - for (const auto& s : seq->stmts_) { - if (ContainsChunkLoop(s)) return true; - } - return false; - } - case ObjectKind::InCoreScopeStmt: - case ObjectKind::AutoInCoreScopeStmt: - case ObjectKind::ClusterScopeStmt: - case ObjectKind::HierarchyScopeStmt: - case ObjectKind::SpmdScopeStmt: { - auto scope = std::static_pointer_cast(stmt); - return ContainsChunkLoop(scope->body_); - } - default: - return false; - } -} - -/** - * @brief Check whether a statement needs an InCore wrapper after auto_incore is consumed. 
- * - * We only wrap statements that still need outlining: - * - compute tensor ops - * - chunk loops that failed interchange or remain sequential - * - * The following stay in orchestration (not wrapped): - * - Pure host-side groups (tensor.assemble/create/slice) - * - Pure scalar assignments (e.g., index arithmetic like `offset = ob * 32`) - * whose value expression contains no tensor/tile-typed sub-expressions - */ -static bool NeedsInCoreWrapping(const StmtPtr& stmt) { - if (!stmt) return false; - - auto kind = stmt->GetKind(); - if (kind == ObjectKind::YieldStmt || kind == ObjectKind::ReturnStmt) return false; - if (ContainsInCoreScope(stmt)) return false; - if (IsPureScalarAssignment(stmt)) return false; - - return ContainsChunkLoop(stmt) || ContainsComputeTensorOp(stmt); -} - -/** - * @brief Wrap statements that lack InCore coverage in ScopeStmt(InCore). - * - * After InterchangeChunkLoops processes the auto_incore body, some statements - * (standalone tensor ops, non-chunked loops, failed-interchange chains) may - * lack InCore wrapping. This function groups consecutive such statements and - * wraps each group in ScopeStmt(InCore). - * - * Control flow statements (YieldStmt, ReturnStmt) are never wrapped. - */ -static StmtPtr WrapNonIncoreStatementsInInCore(const StmtPtr& body, const Span& span, - std::optional split = std::nullopt) { - // When a ForStmt contains InCore scopes in its body (e.g. a pl.range loop - // wrapping interchanged parallel chunks), recurse into it so that non-InCore - // statements *inside* the loop body also get wrapped. 
- auto maybe_recurse_into_compound = [&](const StmtPtr& s) -> StmtPtr { - auto fs = std::dynamic_pointer_cast(s); - if (fs && ContainsInCoreScope(fs->body_)) { - auto new_body = WrapNonIncoreStatementsInInCore(fs->body_, span, split); - if (new_body.get() != fs->body_.get()) { - auto new_for = MutableCopy(fs); - new_for->body_ = new_body; - return new_for; - } - } - return s; - }; - - auto seq = std::dynamic_pointer_cast(body); - if (!seq) { - if (NeedsInCoreWrapping(body)) { - return std::make_shared(split, "", body, span); - } - return maybe_recurse_into_compound(body); - } - - // Check if any wrapping or recursion is needed (fast path) - bool has_work = false; - for (const auto& s : seq->stmts_) { - if (NeedsInCoreWrapping(s)) { - has_work = true; - break; - } - auto fs = std::dynamic_pointer_cast(s); - if (fs && ContainsInCoreScope(fs->body_)) { - has_work = true; - break; - } - } - if (!has_work) return body; - - // Group consecutive wrappable statements and wrap each group in InCore - std::vector result; - std::vector pending; - - auto flush = [&]() { - if (pending.empty()) return; - StmtPtr content = SeqStmts::Flatten(std::vector(pending), span); - result.push_back(std::make_shared(split, "", content, span)); - pending.clear(); - }; - - for (const auto& s : seq->stmts_) { - if (NeedsInCoreWrapping(s)) { - pending.push_back(s); - } else { - flush(); - result.push_back(maybe_recurse_into_compound(s)); - } - } - flush(); - - return SeqStmts::Flatten(std::move(result), span); -} - -/** - * @brief Mutator that interchanges ChunkOuter/ChunkInner loops and inserts InCore scopes. - * - * After SplitChunkedLoops produces nested ChunkOuter → ChunkInner pairs, - * this pass reorders them so all outers are on top, wraps inners + body - * in ScopeStmt(InCore). - * - * Only interchanges when ALL ChunkInner loops in the chain have ForKind::Parallel. 
- */ -class InterchangeChunkLoopsMutator : public IRMutator { - public: - ExprPtr VisitExpr_(const VarPtr& op) override { - auto it = substitution_map_.find(op.get()); - if (it != substitution_map_.end()) { - return it->second; - } - return op; - } - - ExprPtr VisitExpr_(const IterArgPtr& op) override { - auto it = substitution_map_.find(op.get()); - if (it != substitution_map_.end()) { - return it->second; - } - return IRMutator::VisitExpr_(op); - } - - StmtPtr VisitStmt_(const AutoInCoreScopeStmtPtr& op) override { - bool prev = inside_auto_incore_; - auto prev_split = current_split_; - inside_auto_incore_ = true; - current_split_ = op->split_; - auto new_body = VisitStmt(op->body_); - inside_auto_incore_ = prev; - current_split_ = prev_split; - // Consume the AutoInCore wrapper — return body directly. - // Wrap any statements that lack InCore coverage, propagating split. - new_body = WrapNonIncoreStatementsInInCore(new_body, op->span_, op->split_); - return new_body; - } - - StmtPtr VisitStmt_(const ForStmtPtr& op) override { - if (!inside_auto_incore_) { - return IRMutator::VisitStmt_(op); - } - - auto loop_origin = op->GetAttr("loop_origin"); - if (loop_origin == LoopOrigin::ChunkOuter) { - return HandleChunkOuter(op); - } - - if (loop_origin == LoopOrigin::ChunkRemainder) { - return HandleChunkRemainder(op); - } - - // Non-chunk loop: recurse normally - return IRMutator::VisitStmt_(op); - } - - StmtPtr VisitStmt_(const SeqStmtsPtr& op) override { - std::vector new_stmts; - bool changed = false; - - for (const auto& stmt : op->stmts_) { - auto new_stmt = VisitStmt(stmt); - if (new_stmt.get() != stmt.get()) { - changed = true; - } - // Flatten nested SeqStmts - auto seq = std::dynamic_pointer_cast(new_stmt); - if (seq) { - for (const auto& inner : seq->stmts_) { - new_stmts.push_back(inner); - } - } else { - new_stmts.push_back(new_stmt); - } - } - - if (!changed) { - return op; - } - return SeqStmts::Flatten(std::move(new_stmts), op->span_); - } - - private: - 
bool inside_auto_incore_ = false; - bool inside_incore_context_ = false; - std::optional current_split_; - std::unordered_map substitution_map_; - - /** - * @brief Visit a body that will be placed inside an InCore scope. - * - * Sets inside_incore_context_ so nested chains skip their own InCore wrapping. - * Returns whether a parent chain already provides InCore context (prev value). - */ - std::pair VisitBodyInIncoreContext(const StmtPtr& body) { - bool prev_incore = inside_incore_context_; - inside_incore_context_ = true; - auto result = VisitStmt(body); - inside_incore_context_ = prev_incore; - return {result, prev_incore}; - } - - /** - * @brief Collect a chain of chunk loops starting from a ChunkOuter. - * - * Walk into nested ForStmt bodies, collecting (ForStmt, LoopOrigin) entries. - * Stop at non-ForStmt or Original loop. - */ - static std::vector CollectChunkChain(const ForStmtPtr& start) { - std::vector chain; - chain.push_back({start, start->GetAttr("loop_origin")}); - - StmtPtr body = start->body_; - - // Walk through SeqStmts to find the actual ForStmt body - // (body can be SeqStmts with [for_loop, yield]) - while (true) { - ForStmtPtr next_for; - auto seq = std::dynamic_pointer_cast(body); - if (seq) { - // Verify body is exactly {ForStmt} or {ForStmt, YieldStmt} - // to ensure no side-effect statements are dropped during rebuild - size_t for_count = 0; - size_t yield_count = 0; - for (const auto& s : seq->stmts_) { - auto f = std::dynamic_pointer_cast(s); - if (f) { - next_for = f; - ++for_count; - } else if (s->GetKind() == ObjectKind::YieldStmt) { - ++yield_count; - } else { - // Non-loop, non-yield statement found — not safe to interchange - return chain; - } - } - if (for_count != 1 || yield_count > 1) { - return chain; - } - } else { - next_for = std::dynamic_pointer_cast(body); - } - - if (!next_for) break; - auto next_origin = next_for->GetAttr("loop_origin"); - if (next_origin == LoopOrigin::Original) break; - - chain.push_back({next_for, 
next_origin}); - body = next_for->body_; - } - - return chain; - } - - /** - * @brief Handle a ChunkOuter loop: collect chain, check guards, interchange if applicable. - */ - StmtPtr HandleChunkOuter(const ForStmtPtr& op) { - auto chain = CollectChunkChain(op); - - // Separate into outers and inners - std::vector outers; - std::vector inners; - for (const auto& entry : chain) { - if (entry.origin == LoopOrigin::ChunkOuter) { - outers.push_back(entry.for_stmt); - } else if (entry.origin == LoopOrigin::ChunkInner) { - inners.push_back(entry.for_stmt); - } - } - - // Guard: need at least 1 outer and 1 inner - if (outers.empty() || inners.empty()) { - return IRMutator::VisitStmt_(op); - } - - // Guard: all loops in the chain must have compatible iter_arg arity - const size_t ref_iter_args_size = chain.front().for_stmt->iter_args_.size(); - for (const auto& entry : chain) { - if (entry.for_stmt->iter_args_.size() != ref_iter_args_size) { - return IRMutator::VisitStmt_(op); - } - } - - // Guard: all ChunkInner loops must be Parallel - for (const auto& inner : inners) { - if (inner->kind_ != ForKind::Parallel) { - return IRMutator::VisitStmt_(op); - } - } - - // Guard: no existing InCore scope in innermost body - const auto& innermost = chain.back().for_stmt; - if (ContainsInCoreScope(innermost->body_)) { - return IRMutator::VisitStmt_(op); - } - - // Warn if this interchange is nested inside a parent chain's InCore context - if (inside_incore_context_) { - LOG_WARN << op->span_.filename_ << ":" << op->span_.begin_line_ << " — " - << "Nested chunked parallel loop found with intervening statements between it and its parent " - << "chunked parallel — the inner chunk will share the parent's InCore scope instead of " - << "getting its own. 
Consider removing the intervening statements or restructuring the loop " - << "nest so the chunked parallels are directly nested."; - } - - // Perform the interchange - return RebuildInterchanged(outers, inners, chain, op->span_); - } - - /** - * @brief Handle a ChunkRemainder loop: recurse into body and wrap sub-remainder loops in InCore. - * - * After recursion handles nested chunk chains (via HandleChunkOuter), scan the visited body - * for standalone parallel ChunkRemainder sub-loops and wrap each in InCore. - */ - StmtPtr HandleChunkRemainder(const ForStmtPtr& op) { - // Create new iter_args BEFORE visiting the body, and register old->new - // IterArg mappings in substitution_map_ so body references get rewritten. - std::vector new_iter_args; - bool iter_args_changed = false; - new_iter_args.reserve(op->iter_args_.size()); - for (const auto& ia : op->iter_args_) { - auto new_init = VisitExpr(ia->initValue_); - if (new_init.get() != ia->initValue_.get()) { - auto new_ia = std::make_shared(ia->name_hint_, ia->GetType(), new_init, ia->span_); - new_iter_args.push_back(new_ia); - // Register old -> new mapping so body references get rewritten - substitution_map_[ia.get()] = new_ia; - iter_args_changed = true; - } else { - new_iter_args.push_back(ia); - } - } - - // Recurse into the remainder body to handle nested chunk chains - auto new_body = VisitStmt(op->body_); - - // Wrap standalone parallel ChunkRemainder sub-loops in InCore - new_body = WrapSubRemainderLoopsInInCore(new_body, op->span_, current_split_); - - if (new_body.get() == op->body_.get() && !iter_args_changed) { - return op; - } - - auto new_for = MutableCopy(op); - new_for->iter_args_ = new_iter_args; - new_for->body_ = new_body; - return new_for; - } - - /** - * @brief Wrap standalone parallel ChunkRemainder ForStmts in InCore scopes. - * - * Scans top-level statements in body and wraps each ChunkRemainder loop that is - * Parallel and whose body doesn't already contain InCore. 
- */ - static StmtPtr WrapSubRemainderLoopsInInCore(const StmtPtr& body, const Span& span, - std::optional split = std::nullopt) { - auto should_wrap = [](const StmtPtr& s) -> bool { - auto fs = std::dynamic_pointer_cast(s); - return fs && fs->GetAttr("loop_origin") == LoopOrigin::ChunkRemainder && - fs->kind_ == ForKind::Parallel && !ContainsInCoreScope(fs->body_); - }; - - auto seq = std::dynamic_pointer_cast(body); - if (seq) { - std::vector new_stmts; - bool changed = false; - for (const auto& s : seq->stmts_) { - if (should_wrap(s)) { - new_stmts.push_back(std::make_shared(split, "", s, span)); - changed = true; - } else { - new_stmts.push_back(s); - } - } - if (!changed) return body; - return SeqStmts::Flatten(std::move(new_stmts), span); - } - - // Single statement - if (should_wrap(body)) { - return std::make_shared(split, "", body, span); - } - return body; - } - - /** - * @brief Rebuild the interchanged loop nest: outers on top, InCore { inners → body }. - * - * Original chain: O1 → I1 → O2 → I2 → body - * Result: O1 → O2 → InCore{ I1 → I2 → body } - * - * Iter_args are reconnected so the linear data flow is maintained: - * O1.init(original) → O2.init(from O1 iter_arg) → I1.init(from O2 iter_arg) - * → I2.init(from I1 iter_arg) → body - * Yields reverse the data flow back out. - */ - StmtPtr RebuildInterchanged(const std::vector& outers, const std::vector& inners, - const std::vector& chain, const Span& span) { - bool has_iter_args = !chain[0].for_stmt->iter_args_.empty(); - - if (!has_iter_args) { - return RebuildSimple(outers, inners, chain, span); - } - - return RebuildWithIterArgs(outers, inners, chain, span); - } - - /** - * @brief Simple rebuild without iter_args. 
- */ - StmtPtr RebuildSimple(const std::vector& outers, const std::vector& inners, - const std::vector& chain, const Span& span) { - // Get the body from the last loop in inners (not chain.back(), which may be a remainder) - const auto& innermost = inners.back(); - - auto [body, prev_incore] = VisitBodyInIncoreContext(innermost->body_); - - // Build inners inside-out - StmtPtr current = body; - for (int i = static_cast(inners.size()) - 1; i >= 0; --i) { - const auto& inner = inners[i]; - current = std::make_shared(inner->loop_var_, inner->start_, inner->stop_, inner->step_, - std::vector{}, current, std::vector{}, - inner->span_, inner->kind_, std::nullopt, - MakeLoopAttrs(inner->attrs_, LoopOrigin::ChunkInner)); - } - - // Wrap in InCore — skip if a parent chain already provides InCore context - if (!prev_incore) { - current = std::make_shared(current_split_, "", current, span); - } - - // Build outers inside-out, preserving the original ForKind. - for (int i = static_cast(outers.size()) - 1; i >= 0; --i) { - const auto& outer = outers[i]; - current = std::make_shared(outer->loop_var_, outer->start_, outer->stop_, outer->step_, - std::vector{}, current, std::vector{}, - outer->span_, outer->kind_, std::nullopt, - MakeLoopAttrs(outer->attrs_, LoopOrigin::ChunkOuter)); - } - - return current; - } - - /** - * @brief Rebuild with iter_args, reconnecting the SSA data flow. 
- * - * Original chain passes iter_args linearly through nested loops: - * O1.init(x_0) → I1.init(from O1_ia) → O2.init(from I1_ia) → I2.init(from O2_ia) → body - * - * After interchange: O1 → O2 → InCore{ I1 → I2 → body } - * New data flow: - * O1.init(x_0) → O2.init(from O1_ia) → I1.init(from O2_ia) → I2.init(from I1_ia) → body - */ - StmtPtr RebuildWithIterArgs(const std::vector& outers, const std::vector& inners, - const std::vector& chain, const Span& span) { - // Reorder the chain entries: outers first, then inners - std::vector reordered; - reordered.reserve(outers.size() + inners.size()); - for (const auto& o : outers) reordered.push_back(o); - for (const auto& i : inners) reordered.push_back(i); - - size_t num_iter_args = chain[0].for_stmt->iter_args_.size(); - size_t total_loops = reordered.size(); - - // Create fresh iter_args and return_vars for each loop in the reordered chain - std::vector> new_iter_args(total_loops); - std::vector> new_return_vars(total_loops); - - // The outermost loop gets the original init values from the first chain entry - const auto& first_orig = chain[0].for_stmt; - - for (size_t loop_idx = 0; loop_idx < total_loops; ++loop_idx) { - const auto& orig_loop = reordered[loop_idx]; - for (size_t ia_idx = 0; ia_idx < num_iter_args; ++ia_idx) { - const auto& orig_ia = first_orig->iter_args_[ia_idx]; - auto parsed_name = auto_name::Parse(orig_ia->name_hint_); - std::string loop_qualifier = auto_name::LoopLevelQualifier(static_cast(loop_idx)); - std::string combined_qualifier = - parsed_name.qualifier.empty() ? 
loop_qualifier : parsed_name.qualifier + "_" + loop_qualifier; - std::string ia_name = - auto_name::BuildName(parsed_name.base_name, combined_qualifier, "iter", parsed_name.version); - std::string rv_name = - auto_name::BuildName(parsed_name.base_name, combined_qualifier, "rv", parsed_name.version); - - ExprPtr init_value; - if (loop_idx == 0) { - // Outermost: use original init values (apply substitutions for nested chains) - init_value = VisitExpr(orig_ia->initValue_); - } else { - // Chain from previous loop's iter_arg - init_value = new_iter_args[loop_idx - 1][ia_idx]; - } - - auto new_ia = std::make_shared(ia_name, orig_ia->GetType(), init_value, orig_ia->span_); - auto new_rv = std::make_shared(rv_name, orig_ia->GetType(), orig_ia->span_); - - new_iter_args[loop_idx].push_back(new_ia); - new_return_vars[loop_idx].push_back(new_rv); - } - } - - // Now set up substitutions for the body: - // The last loop in reordered (last inner) passes its iter_args to the body. - // We remap its original iter_args to the new innermost iter_args. - // Note: chain.back() may be a ChunkRemainder that is NOT in reordered, - // so we must use reordered.back() to get the actual innermost interchange loop. 
- const auto& orig_innermost = reordered.back(); - size_t innermost_reordered_idx = total_loops - 1; - - for (size_t ia_idx = 0; ia_idx < num_iter_args; ++ia_idx) { - substitution_map_[orig_innermost->iter_args_[ia_idx].get()] = - new_iter_args[innermost_reordered_idx][ia_idx]; - } - - // Visit the innermost body with substitutions - auto [body, prev_incore] = VisitBodyInIncoreContext(orig_innermost->body_); - - // Build the loop nest inside-out, starting from the innermost (last in reordered) - StmtPtr current = body; - - for (int i = static_cast(total_loops) - 1; i >= 0; --i) { - const auto& orig_loop = reordered[i]; - bool is_inner = (orig_loop->GetAttr("loop_origin") == LoopOrigin::ChunkInner); - - // Build yield for this loop from the inner loop's return_vars - // (or body's yield values for the innermost) - if (!new_return_vars[i].empty()) { - std::vector yield_values; - if (i < static_cast(total_loops) - 1) { - // Yield the return vars of the next inner loop - for (const auto& rv : new_return_vars[i + 1]) { - yield_values.push_back(rv); - } - } else { - // Innermost: body already contains yield, current already has it - // Don't add extra yield - } - - if (!yield_values.empty()) { - auto yield_stmt = std::make_shared(yield_values, span); - current = SeqStmts::Flatten(std::vector{current, yield_stmt}, span); - } - } - - current = std::make_shared( - orig_loop->loop_var_, orig_loop->start_, orig_loop->stop_, orig_loop->step_, new_iter_args[i], - current, new_return_vars[i], orig_loop->span_, orig_loop->kind_, std::nullopt, - MakeLoopAttrs(orig_loop->attrs_, is_inner ? LoopOrigin::ChunkInner : LoopOrigin::ChunkOuter)); - - // Insert InCore scope right after building all inners (at the boundary). - // Skip if a parent chain already provides InCore context. - if (!prev_incore && !is_inner && i + 1 < static_cast(total_loops) && - reordered[i + 1]->GetAttr("loop_origin") == LoopOrigin::ChunkInner) { - // The current ForStmt body already contains the inner loops. 
- // We need to wrap the inner loop nest (current's body) in InCore. - // But current IS the outermost outer that contains inners already. - // Actually, we need to insert InCore between the last outer and first inner. - // Let's restructure: wrap the body of this outer in InCore. - auto outer_for = std::static_pointer_cast(current); - - // Extract the body (which is inners + yield) - auto incore_body = outer_for->body_; - // Separate the yield at the end from the body content - auto body_seq = std::dynamic_pointer_cast(incore_body); - if (body_seq && body_seq->stmts_.size() >= 2) { - // Last stmt should be yield, rest goes into InCore - std::vector incore_stmts; - incore_stmts.reserve(body_seq->stmts_.size() - 1); - for (size_t si = 0; si < body_seq->stmts_.size() - 1; ++si) { - incore_stmts.push_back(body_seq->stmts_[si]); - } - auto last_stmt = body_seq->stmts_.back(); - - StmtPtr incore_content; - if (incore_stmts.size() == 1) { - incore_content = incore_stmts[0]; - } else { - incore_content = SeqStmts::Flatten(std::move(incore_stmts), span); - } - - auto incore_scope = std::make_shared(current_split_, "", incore_content, span); - auto new_body = SeqStmts::Flatten(std::vector{incore_scope, last_stmt}, span); - - current = std::make_shared( - outer_for->loop_var_, outer_for->start_, outer_for->stop_, outer_for->step_, - outer_for->iter_args_, new_body, outer_for->return_vars_, outer_for->span_, outer_for->kind_, - std::nullopt, MakeLoopAttrs(outer_for->attrs_, LoopOrigin::ChunkOuter)); - } else { - // No yield, wrap entire body - auto incore_scope = std::make_shared(current_split_, "", incore_body, span); - current = std::make_shared( - outer_for->loop_var_, outer_for->start_, outer_for->stop_, outer_for->step_, - outer_for->iter_args_, incore_scope, outer_for->return_vars_, outer_for->span_, - outer_for->kind_, std::nullopt, MakeLoopAttrs(outer_for->attrs_, LoopOrigin::ChunkOuter)); - } - } - } - - // Remap original outer return_vars to new outermost 
return_vars - for (size_t ia_idx = 0; ia_idx < num_iter_args; ++ia_idx) { - substitution_map_[first_orig->return_vars_[ia_idx].get()] = new_return_vars[0][ia_idx]; - } - - return current; - } -}; - -/** - * @brief Transform a function by interchanging chunk loops and inserting InCore scopes. - */ -FunctionPtr TransformInterchangeChunkLoops(const FunctionPtr& func) { - INTERNAL_CHECK(func) << "InterchangeChunkLoops cannot run on null function"; - - InterchangeChunkLoopsMutator mutator; - auto new_body = mutator.VisitStmt(func->body_); - - if (new_body.get() == func->body_.get()) { - return func; - } - - auto new_func = MutableCopy(func); - new_func->body_ = new_body; - return new_func; -} - -} // namespace - -// Factory function -namespace pass { -Pass InterchangeChunkLoops() { - return CreateFunctionPass(TransformInterchangeChunkLoops, "InterchangeChunkLoops", - kInterchangeChunkLoopsProperties); -} -} // namespace pass - -// ============================================================================ -// NoNestedInCore structural property verifier -// ============================================================================ - -namespace { - -constexpr int kNestedIncoreCode = 501; - -/// Detects nested ScopeStmt(InCore) scopes in an IR tree. 
-class NestedInCoreScopeDetector : public IRVisitor { - public: - explicit NestedInCoreScopeDetector(std::vector& diagnostics) : diagnostics_(diagnostics) {} - - void VisitStmt_(const InCoreScopeStmtPtr& op) override { - if (!op) return; - if (inside_incore_) { - diagnostics_.emplace_back(DiagnosticSeverity::Error, "NoNestedInCore", kNestedIncoreCode, - "Nested InCore scope detected — InCore scopes must not contain other " - "InCore scopes", - op->span_); - } - bool prev = inside_incore_; - inside_incore_ = true; - IRVisitor::VisitStmt_(op); - inside_incore_ = prev; - } - - private: - std::vector& diagnostics_; - bool inside_incore_ = false; -}; - -} // namespace - -class NoNestedIncorePropertyVerifierImpl : public PropertyVerifier { - public: - [[nodiscard]] std::string GetName() const override { return "NoNestedInCore"; } - - void Verify(const ProgramPtr& program, std::vector& diagnostics) override { - if (!program) return; - for (const auto& [gv, func] : program->functions_) { - if (!func || !func->body_) continue; - NestedInCoreScopeDetector detector(diagnostics); - detector.VisitStmt(func->body_); - } - } -}; - -PropertyVerifierPtr CreateNoNestedIncorePropertyVerifier() { - return std::make_shared(); -} - -} // namespace ir -} // namespace pypto diff --git a/src/ir/transforms/ir_property.cpp b/src/ir/transforms/ir_property.cpp index 66cfc80e6..538928a35 100644 --- a/src/ir/transforms/ir_property.cpp +++ b/src/ir/transforms/ir_property.cpp @@ -33,8 +33,6 @@ std::string IRPropertyToString(IRProperty prop) { return "NormalizedStmtStructure"; case IRProperty::NoRedundantBlocks: return "NoRedundantBlocks"; - case IRProperty::SplitIncoreOrch: - return "SplitIncoreOrch"; case IRProperty::HasMemRefs: return "HasMemRefs"; case IRProperty::IncoreTileOps: @@ -61,8 +59,6 @@ std::string IRPropertyToString(IRProperty prop) { return "VectorKernelSplit"; case IRProperty::OutParamNotShadowed: return "OutParamNotShadowed"; - case IRProperty::NoNestedInCore: - return 
"NoNestedInCore"; case IRProperty::InOutUseValid: return "InOutUseValid"; default: @@ -132,8 +128,7 @@ VerificationLevel GetDefaultVerificationLevel() { const IRPropertySet& GetStructuralProperties() { static const IRPropertySet props{IRProperty::TypeChecked, IRProperty::BreakContinueValid, IRProperty::NoRedundantBlocks, IRProperty::UseAfterDef, - IRProperty::OutParamNotShadowed, IRProperty::NoNestedInCore, - IRProperty::InOutUseValid}; + IRProperty::OutParamNotShadowed, IRProperty::InOutUseValid}; return props; } @@ -144,8 +139,7 @@ const IRPropertySet& GetDefaultVerifyProperties() { IRProperty::BreakContinueValid, IRProperty::NoRedundantBlocks, IRProperty::UseAfterDef, - IRProperty::OutParamNotShadowed, - IRProperty::NoNestedInCore}; + IRProperty::OutParamNotShadowed}; return props; } diff --git a/src/ir/transforms/legalize_pto_buffer_reuse_pass.cpp b/src/ir/transforms/legalize_pto_buffer_reuse_pass.cpp index c46cdc99c..7e257df48 100644 --- a/src/ir/transforms/legalize_pto_buffer_reuse_pass.cpp +++ b/src/ir/transforms/legalize_pto_buffer_reuse_pass.cpp @@ -477,7 +477,7 @@ FunctionPtr TransformLegalizePTOBufferReuse(const FunctionPtr& func) { namespace pass { Pass LegalizePTOBufferReuse() { - static const PassProperties kProps{.required = {IRProperty::SplitIncoreOrch, IRProperty::IncoreTileOps, + static const PassProperties kProps{.required = {IRProperty::HierarchyOutlined, IRProperty::IncoreTileOps, IRProperty::HasMemRefs, IRProperty::TileOps2D}}; return CreateFunctionPass(TransformLegalizePTOBufferReuse, "LegalizePTOBufferReuse", kProps); } diff --git a/src/ir/transforms/memory_reuse_pass.cpp b/src/ir/transforms/memory_reuse_pass.cpp index 1f863f9b2..74236254a 100644 --- a/src/ir/transforms/memory_reuse_pass.cpp +++ b/src/ir/transforms/memory_reuse_pass.cpp @@ -118,8 +118,6 @@ class LifetimeAnalyzer : public IRVisitor { } } - void VisitStmt_(const InCoreScopeStmtPtr& op) override { VisitStmt(op->body_); } - void VisitStmt_(const AutoInCoreScopeStmtPtr& op) 
override { VisitStmt(op->body_); } void VisitStmt_(const ClusterScopeStmtPtr& op) override { VisitStmt(op->body_); } void VisitStmt_(const HierarchyScopeStmtPtr& op) override { VisitStmt(op->body_); } void VisitStmt_(const SpmdScopeStmtPtr& op) override { VisitStmt(op->body_); } diff --git a/src/ir/transforms/mutator.cpp b/src/ir/transforms/mutator.cpp index b6faa550f..5ffab9c12 100644 --- a/src/ir/transforms/mutator.cpp +++ b/src/ir/transforms/mutator.cpp @@ -589,30 +589,6 @@ StmtPtr IRMutator::VisitStmt_(const WhileStmtPtr& op) { return op; } -StmtPtr IRMutator::VisitStmt_(const InCoreScopeStmtPtr& op) { - INTERNAL_CHECK_SPAN(op->body_, op->span_) << "InCoreScopeStmt has null body"; - auto new_body = StmtFunctor::VisitStmt(op->body_); - INTERNAL_CHECK_SPAN(new_body, op->span_) << "InCoreScopeStmt body mutated to null"; - if (new_body.get() != op->body_.get()) { - auto result = MutableCopy(op); - result->body_ = std::move(new_body); - return result; - } - return op; -} - -StmtPtr IRMutator::VisitStmt_(const AutoInCoreScopeStmtPtr& op) { - INTERNAL_CHECK_SPAN(op->body_, op->span_) << "AutoInCoreScopeStmt has null body"; - auto new_body = StmtFunctor::VisitStmt(op->body_); - INTERNAL_CHECK_SPAN(new_body, op->span_) << "AutoInCoreScopeStmt body mutated to null"; - if (new_body.get() != op->body_.get()) { - auto result = MutableCopy(op); - result->body_ = std::move(new_body); - return result; - } - return op; -} - StmtPtr IRMutator::VisitStmt_(const ClusterScopeStmtPtr& op) { INTERNAL_CHECK_SPAN(op->body_, op->span_) << "ClusterScopeStmt has null body"; auto new_body = StmtFunctor::VisitStmt(op->body_); diff --git a/src/ir/transforms/outline_hierarchy_scopes_pass.cpp b/src/ir/transforms/outline_hierarchy_scopes_pass.cpp index 4276224d0..c5b37b4d6 100644 --- a/src/ir/transforms/outline_hierarchy_scopes_pass.cpp +++ b/src/ir/transforms/outline_hierarchy_scopes_pass.cpp @@ -29,11 +29,14 @@ namespace ir { namespace pass { /** - * @brief Pass to outline Hierarchy scopes 
into separate functions with level/role + * @brief Pass to outline non-CORE_GROUP Hierarchy scopes into separate functions. * - * This pass transforms ScopeStmt(Hierarchy) nodes into separate Function definitions - * that carry the scope's Level and Role metadata, and replaces the scope with a Call - * to the outlined function. + * This pass transforms HierarchyScopeStmt nodes whose `level_` is anything other + * than `Level::CORE_GROUP` into separate Function definitions that carry the + * scope's Level/Role metadata, and replaces the scope with a Call to the outlined + * function. CORE_GROUP scopes are intentionally left intact for the subsequent + * `OutlineIncoreScopes` pass, which emits `Function(InCore)` and promotes the + * parent function from `Opaque` to `Orchestration`. * * Requirements: * - Input IR must be in SSA form (run ConvertToSSA first) @@ -41,14 +44,13 @@ namespace pass { * - Should run before OutlineIncoreScopes and OutlineClusterScopes * * Transformation: - * 1. For each ScopeStmt(Hierarchy) in an Opaque function: - * - Analyze body to determine external variable references (inputs) - * - Analyze subsequent statements to determine which definitions are outputs - * - Extract body into new Function(Opaque, level, role) with appropriate params/returns - * - Replace scope with Call to the outlined function + output assignments - * 2. Recursively handles nested Hierarchy scopes - * 3. Add outlined functions to the program - * 4. Parent function type is preserved (not promoted) + * 1. For each HierarchyScopeStmt at level != CORE_GROUP in an Opaque function: + * - Analyze body for inputs/outputs + * - Extract body into a new Opaque Function carrying the scope's level/role + * - Replace the scope with a Call to the outlined function + output assignments + * 2. Recursively descends into other scopes; nested non-CORE_GROUP Hierarchy + * scopes are outlined together with their parent. + * 3. CORE_GROUP scopes (and their bodies) are preserved verbatim. 
*/ Pass OutlineHierarchyScopes() { auto pass_func = [](const ProgramPtr& program) -> ProgramPtr { @@ -71,15 +73,20 @@ Pass OutlineHierarchyScopes() { } type_collector.VisitStmt(func->body_); - // Outline Hierarchy scopes in this function + // Outline non-CORE_GROUP Hierarchy scopes; CORE_GROUP scopes are skipped + // and handled by OutlineIncoreScopes downstream. + outline_utils::ScopeOutliner::HierarchyLevelFilter filter{ + Level::CORE_GROUP, outline_utils::ScopeOutliner::HierarchyLevelFilter::Mode::Exclude}; outline_utils::ScopeOutliner outliner(func->name_, type_collector.var_types, type_collector.var_objects, type_collector.known_names, ScopeKind::Hierarchy, - FunctionType::Opaque, "_hierarchy_"); + /*outlined_func_type=*/FunctionType::Opaque, "_hierarchy_", + /*program=*/nullptr, filter); auto new_body = outliner.VisitStmt(func->body_); - // Preserve parent function type (don't promote — hierarchy is orthogonal to FunctionType) auto new_func = MutableCopy(func); new_func->body_ = new_body; + // Parent type unchanged; CORE_GROUP-driven promotion to Orchestration + // happens in OutlineIncoreScopes. new_functions.push_back(new_func); const auto& outlined = outliner.GetOutlinedFunctions(); @@ -101,6 +108,10 @@ Pass OutlineHierarchyScopes() { // ============================================================================ // HierarchyOutlined property verifier // ============================================================================ +// +// This verifier is shared between OutlineHierarchyScopes and OutlineIncoreScopes. +// The HierarchyOutlined property is produced by OutlineIncoreScopes (which runs +// after OutlineHierarchyScopes), since CORE_GROUP scopes survive the first pass. 
namespace { @@ -116,9 +127,13 @@ class HierarchyOutlinedPropertyVerifierImpl : public PropertyVerifier { if (!program) return; for (const auto& [gv, func] : program->functions_) { if (!func || !func->body_) continue; - // Only check Opaque functions — the pass only processes Opaque functions, - // so Hierarchy scopes in other function types are not expected to be outlined. - if (func->func_type_ != FunctionType::Opaque) continue; + // After both outline passes have run, no Hierarchy scopes should remain in + // Opaque/Orchestration functions. Inside InCore/Group/Spmd outlined + // functions, Hierarchy scopes are disallowed by construction (the outliner + // only produces leaf scope bodies). + if (func->func_type_ != FunctionType::Opaque && func->func_type_ != FunctionType::Orchestration) { + continue; + } HierarchyOutlinedVerifier verifier(diagnostics, "HierarchyOutlined", "Hierarchy ScopeStmt found in function (should have been outlined)"); verifier.VisitStmt(func->body_); diff --git a/src/ir/transforms/outline_incore_scopes_pass.cpp b/src/ir/transforms/outline_incore_scopes_pass.cpp index 07028f048..ea7dfe956 100644 --- a/src/ir/transforms/outline_incore_scopes_pass.cpp +++ b/src/ir/transforms/outline_incore_scopes_pass.cpp @@ -10,21 +10,16 @@ */ #include -#include -#include #include -#include "pypto/core/error.h" -#include "pypto/ir/expr.h" #include "pypto/ir/function.h" #include "pypto/ir/program.h" #include "pypto/ir/stmt.h" +#include "pypto/ir/transforms/base/visitor.h" #include "pypto/ir/transforms/pass_properties.h" #include "pypto/ir/transforms/passes.h" #include "pypto/ir/transforms/utils/mutable_copy.h" #include "pypto/ir/transforms/utils/scope_outline_utils.h" -#include "pypto/ir/transforms/utils/transform_utils.h" -#include "pypto/ir/verifier/verifier.h" namespace pypto { namespace ir { @@ -32,38 +27,64 @@ namespace ir { namespace pass { /** - * @brief Pass to outline InCore scopes into separate functions + * @brief Pass to outline CORE_GROUP Hierarchy 
scopes into InCore functions. * - * This pass transforms ScopeStmt(InCore) nodes into separate Function(InCore) definitions - * and replaces the scope with a Call to the outlined function. + * This pass picks up where OutlineHierarchyScopes leaves off: it transforms + * every `HierarchyScopeStmt(level=CORE_GROUP)` that survived the previous pass + * into a separate `Function(InCore)` definition and replaces the scope with a + * `Call` to that function. When any CORE_GROUP scope is outlined out of an + * `Opaque` function, the parent function is promoted from `Opaque` to + * `Orchestration` so downstream tile-level passes see the canonical + * Orchestration → InCore call shape. * * Requirements: * - Input IR must be in SSA form (run ConvertToSSA first) - * - Only processes Opaque functions (InCore functions are left unchanged) + * - Should run after OutlineHierarchyScopes and before OutlineClusterScopes + * - Only processes Opaque functions * - * Transformation: - * 1. For each ScopeStmt(InCore) in an Opaque function: - * - Analyze body to determine external variable references (inputs) - * - Analyze subsequent statements to determine which definitions are outputs - * - Extract body into new Function(InCore) with appropriate params/returns - * - Replace scope with Call to the outlined function + output assignments - * - EvalStmt(store) calls on output tensors are converted to AssignStmt - * 2. Recursively handles nested InCore scopes - * 3. Add outlined functions to the program - * 4. Promote the parent function from Opaque to Orchestration + * Together with OutlineHierarchyScopes this pass establishes the + * `HierarchyOutlined` property: after both have run, no `HierarchyScopeStmt` + * remains in any Opaque/Orchestration function body. */ +namespace { + +/// Returns true iff any HierarchyScopeStmt at Level::CORE_GROUP appears under +/// the given statement. Used to decide whether to promote the parent function +/// from Opaque to Orchestration. 
+class CoreGroupHierarchyFinder : public IRVisitor { + public: + bool found = false; + + protected: + void VisitStmt_(const HierarchyScopeStmtPtr& op) override { + if (op->level_ == Level::CORE_GROUP) { + found = true; + } + IRVisitor::VisitStmt_(op); + } +}; + +} // namespace + Pass OutlineIncoreScopes() { auto pass_func = [](const ProgramPtr& program) -> ProgramPtr { std::vector new_functions; std::vector all_outlined_functions; for (const auto& [gvar, func] : program->functions_) { - // Only process Opaque functions (InCore functions are already outlined) + // Only Opaque functions can carry CORE_GROUP HierarchyScopeStmts at this + // point in the pipeline. if (func->func_type_ != FunctionType::Opaque) { new_functions.push_back(func); continue; } + // Detect CORE_GROUP scopes before outlining; outliner.GetOutlinedFunctions() + // tells us *what* was outlined, but we need the parent-promotion decision + // up front so it is symmetric with future filters. + CoreGroupHierarchyFinder finder; + finder.VisitStmt(func->body_); + // Build symbol table for this function outline_utils::VarCollector type_collector; for (const auto& var : func->params_) { @@ -73,29 +94,31 @@ Pass OutlineIncoreScopes() { } type_collector.VisitStmt(func->body_); - // Outline InCore scopes in this function + // Outline only HierarchyScopeStmts at CORE_GROUP into InCore functions. + outline_utils::ScopeOutliner::HierarchyLevelFilter filter{ + Level::CORE_GROUP, outline_utils::ScopeOutliner::HierarchyLevelFilter::Mode::Only}; outline_utils::ScopeOutliner outliner(func->name_, type_collector.var_types, type_collector.var_objects, - type_collector.known_names, ScopeKind::InCore, - FunctionType::InCore, "_incore_"); + type_collector.known_names, ScopeKind::Hierarchy, + /*outlined_func_type=*/FunctionType::InCore, "_incore_", + /*program=*/nullptr, filter); auto new_body = outliner.VisitStmt(func->body_); - // Create new function with transformed body. 
- // If any InCore scopes were outlined, promote Opaque -> Orchestration. - const auto& outlined = outliner.GetOutlinedFunctions(); - FunctionType new_func_type = outlined.empty() ? func->func_type_ : FunctionType::Orchestration; auto new_func = MutableCopy(func); new_func->body_ = new_body; - new_func->func_type_ = new_func_type; + if (finder.found) { + // Promote parent Opaque → Orchestration whenever any CORE_GROUP scope + // was outlined, matching the contract the former OutlineIncoreScopes + // (driven by InCoreScopeStmt) used to satisfy. + new_func->func_type_ = FunctionType::Orchestration; + } new_functions.push_back(new_func); - // Collect outlined functions (prepend before parent so inner functions come first) + const auto& outlined = outliner.GetOutlinedFunctions(); all_outlined_functions.insert(all_outlined_functions.end(), outlined.begin(), outlined.end()); } - // Add all outlined functions before the originals + // Outlined functions go before the originals so call sites can reference them. all_outlined_functions.insert(all_outlined_functions.end(), new_functions.begin(), new_functions.end()); - - // Create new program with all functions return std::make_shared(all_outlined_functions, program->name_, program->span_); }; @@ -103,69 +126,5 @@ Pass OutlineIncoreScopes() { } } // namespace pass - -// ============================================================================ -// SplitIncoreOrch property verifier -// ============================================================================ - -namespace { - -/** - * @brief Checks no InCore ScopeStmts remain in Opaque or Orchestration functions. - */ -using SplitIncoreOrchVerifier = outline_utils::ScopeKindAbsenceVerifier; - -static bool IsComputeTensorOp(const std::string& op_name) { - return transform_utils::IsComputeTensorOp(op_name); -} - -/// Checks Orchestration functions for compute tensor ops that should be in InCore. 
-class OrchComputeTensorOpVerifier : public IRVisitor { - public: - explicit OrchComputeTensorOpVerifier(std::vector& diagnostics) : diagnostics_(diagnostics) {} - - void VisitExpr_(const CallPtr& op) override { - if (op && op->op_ && IsComputeTensorOp(op->op_->name_)) { - diagnostics_.emplace_back(DiagnosticSeverity::Warning, "SplitIncoreOrch", 0, - "Compute tensor op '" + op->op_->name_ + - "' found in Orchestration function (should be inside InCore)", - op->span_); - } - IRVisitor::VisitExpr_(op); - } - - private: - std::vector& diagnostics_; -}; - -} // namespace - -class SplitIncoreOrchPropertyVerifierImpl : public PropertyVerifier { - public: - [[nodiscard]] std::string GetName() const override { return "SplitIncoreOrch"; } - - void Verify(const ProgramPtr& program, std::vector& diagnostics) override { - if (!program) return; - for (const auto& [gv, func] : program->functions_) { - if (!func || !func->body_) continue; - // Check Opaque and Orchestration functions — InCore functions are expected to have InCore content - if (func->func_type_ == FunctionType::InCore) continue; - SplitIncoreOrchVerifier verifier( - diagnostics, "SplitIncoreOrch", - "InCore ScopeStmt found in non-InCore function (should have been outlined)"); - verifier.VisitStmt(func->body_); - // Also check Orchestration functions for leaked compute tensor ops - if (func->func_type_ == FunctionType::Orchestration) { - OrchComputeTensorOpVerifier compute_verifier(diagnostics); - compute_verifier.VisitStmt(func->body_); - } - } - } -}; - -PropertyVerifierPtr CreateSplitIncoreOrchPropertyVerifier() { - return std::make_shared(); -} - } // namespace ir } // namespace pypto diff --git a/src/ir/transforms/python_printer.cpp b/src/ir/transforms/python_printer.cpp index 47505cc30..af577cd14 100644 --- a/src/ir/transforms/python_printer.cpp +++ b/src/ir/transforms/python_printer.cpp @@ -239,8 +239,6 @@ class IRPythonPrinter : public IRVisitor { void VisitStmt_(const ReturnStmtPtr& op) override; void 
VisitStmt_(const ForStmtPtr& op) override; void VisitStmt_(const WhileStmtPtr& op) override; - void VisitStmt_(const InCoreScopeStmtPtr& op) override; - void VisitStmt_(const AutoInCoreScopeStmtPtr& op) override; void VisitStmt_(const ClusterScopeStmtPtr& op) override; void VisitStmt_(const HierarchyScopeStmtPtr& op) override; void VisitStmt_(const SpmdScopeStmtPtr& op) override; @@ -912,9 +910,9 @@ void IRPythonPrinter::VisitStmt_(const ForStmtPtr& op) { VisitExpr(op->step_); } - // Unroll loops cannot have iter_args. The DSL parser forbids init_values for - // pl.unroll(), and SplitChunkedLoops preserves this: chunk-split unroll loops - // always take the simple (no iter_args) path. + // Unroll loops cannot have iter_args. The DSL parser forbids init_values + // for pl.unroll(), and no built-in pass produces an unroll loop with + // iter_args; printers that hit this branch indicate a malformed IR. if (op->kind_ == ForKind::Unroll && !op->iter_args_.empty()) { INTERNAL_CHECK_SPAN(false, op->span_) << "ForKind::Unroll does not support iter_args/init_values"; } @@ -1029,41 +1027,14 @@ void IRPythonPrinter::VisitStmt_(const WhileStmtPtr& op) { } void IRPythonPrinter::VisitStmt_(const HierarchyScopeStmtPtr& op) { - // Print as: with pl.at(level=pl.Level.X, role=pl.Role.Y, [name_hint="..."]): + // Print as: with pl.at(level=pl.Level.X, [role=...], [optimizations=[pl.split(...)]], [name_hint=...]): stream_ << "with " << prefix_ << ".at(level=" << prefix_ << ".Level." << LevelToString(op->level_); if (op->role_.has_value()) { stream_ << ", role=" << prefix_ << ".Role." 
<< RoleToString(*op->role_); } - if (!op->name_hint_.empty()) { - stream_ << ", name_hint=\"" << op->name_hint_ << "\""; - } - stream_ << "):\n"; - IncreaseIndent(); - PrintStmtBlock(op->body_); - DecreaseIndent(); -} - -void IRPythonPrinter::VisitStmt_(const InCoreScopeStmtPtr& op) { - stream_ << "with " << prefix_ << ".at(level=" << prefix_ << ".Level.CORE_GROUP"; if (op->split_.has_value() && op->split_.value() != SplitMode::None) { - stream_ << ", split=" << prefix_ << ".SplitMode." << SplitModeToPythonString(op->split_.value()); - } - if (!op->name_hint_.empty()) { - stream_ << ", name_hint=\"" << op->name_hint_ << "\""; - } - stream_ << "):\n"; - IncreaseIndent(); - PrintStmtBlock(op->body_); - DecreaseIndent(); -} - -void IRPythonPrinter::VisitStmt_(const AutoInCoreScopeStmtPtr& op) { - stream_ << "with " << prefix_ << ".at(level=" << prefix_ << ".Level.CORE_GROUP, optimization="; - if (op->split_.has_value() && op->split_.value() != SplitMode::None) { - stream_ << prefix_ << ".chunked_loop_optimizer(split=" << prefix_ << ".SplitMode." - << SplitModeToPythonString(op->split_.value()) << ")"; - } else { - stream_ << prefix_ << ".chunked_loop_optimizer"; + stream_ << ", optimizations=[" << prefix_ << ".split(" << prefix_ << ".SplitMode." + << SplitModeToPythonString(op->split_.value()) << ")]"; } if (!op->name_hint_.empty()) { stream_ << ", name_hint=\"" << op->name_hint_ << "\""; diff --git a/src/ir/transforms/split_chunked_loops_pass.cpp b/src/ir/transforms/split_chunked_loops_pass.cpp deleted file mode 100644 index 074338f34..000000000 --- a/src/ir/transforms/split_chunked_loops_pass.cpp +++ /dev/null @@ -1,832 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. 
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "pypto/core/dtype.h" -#include "pypto/core/error.h" -#include "pypto/core/logging.h" -#include "pypto/ir/core.h" -#include "pypto/ir/expr.h" -#include "pypto/ir/function.h" -#include "pypto/ir/scalar_expr.h" -#include "pypto/ir/span.h" -#include "pypto/ir/stmt.h" -#include "pypto/ir/transforms/base/mutator.h" -#include "pypto/ir/transforms/pass_properties.h" -#include "pypto/ir/transforms/passes.h" -#include "pypto/ir/transforms/utils/auto_name_utils.h" -#include "pypto/ir/transforms/utils/mutable_copy.h" -#include "pypto/ir/transforms/utils/transform_utils.h" -#include "pypto/ir/type.h" - -namespace pypto { -namespace ir { - -using Attrs = std::vector>; -using transform_utils::CollectDefVars; - -namespace { - -/// Build attrs for a generated loop: copy original attrs (excluding loop_origin) and set the new origin. -Attrs MakeLoopAttrs(const Attrs& original_attrs, LoopOrigin origin) { - Attrs result; - for (const auto& [key, value] : original_attrs) { - if (key != "loop_origin") result.emplace_back(key, value); - } - result.emplace_back("loop_origin", origin); - return result; -} - -/** - * @brief Try to extract a compile-time integer from a ConstInt or Neg(ConstInt). - * @return The integer value, or std::nullopt if not a compile-time constant. 
- */ -static std::optional TryGetConstInt(const ExprPtr& expr) { - auto ci = std::dynamic_pointer_cast(expr); - if (ci) { - return ci->value_; - } - auto neg = std::dynamic_pointer_cast(expr); - if (neg) { - auto inner = std::dynamic_pointer_cast(neg->operand_); - if (inner) { - return -inner->value_; - } - } - return std::nullopt; -} - -/** - * @brief Extract a compile-time integer value from a ConstInt or Neg(ConstInt) expression. - */ -static int64_t GetConstIntValue(const ExprPtr& expr, const std::string& what) { - auto val = TryGetConstInt(expr); - if (val.has_value()) { - return *val; - } - throw pypto::ValueError("Chunked loop " + what + " must be a compile-time integer constant, got " + - expr->TypeName()); -} - -/** - * @brief Create a ConstInt expression with INDEX dtype. - */ -static ExprPtr MakeConstIndex(int64_t value, const Span& span) { - return std::make_shared(value, DataType::INDEX, span); -} - -/** - * @brief Compute trip count from compile-time constant bounds. - */ -static int64_t ComputeStaticTripCount(int64_t start, int64_t stop, int64_t step) { - if (step > 0 && start < stop) { - return (stop - start + step - 1) / step; - } - if (step < 0 && start > stop) { - return (start - stop + (-step) - 1) / (-step); - } - return 0; -} - -/** - * @brief Build trip count as an expression tree for dynamic bounds. 
- * - * Produces: max(ceildiv(stop - start, step), 0) when step > 0 - * max(ceildiv(start - stop, -step), 0) when step < 0 - */ -static ExprPtr BuildTripCountExpr(const ExprPtr& start, const ExprPtr& stop, int64_t step, const Span& sp) { - ExprPtr trip_count; - if (step > 0) { - ExprPtr range_size = MakeSub(stop, start, sp); - if (step == 1) { - trip_count = range_size; - } else { - trip_count = - MakeFloorDiv(MakeAdd(range_size, MakeConstIndex(step - 1, sp), sp), MakeConstIndex(step, sp), sp); - } - } else { - ExprPtr range_size = MakeSub(start, stop, sp); - int64_t abs_step = -step; - if (abs_step == 1) { - trip_count = range_size; - } else { - trip_count = MakeFloorDiv(MakeAdd(range_size, MakeConstIndex(abs_step - 1, sp), sp), - MakeConstIndex(abs_step, sp), sp); - } - } - return MakeMax(trip_count, MakeConstIndex(0, sp), sp); -} - -static void CollectDeclaredNames(const StmtPtr& stmt, std::unordered_set& result) { - if (!stmt) return; - - auto kind = stmt->GetKind(); - switch (kind) { - case ObjectKind::AssignStmt: { - auto assign = std::static_pointer_cast(stmt); - result.insert(assign->var_->name_hint_); - break; - } - case ObjectKind::ForStmt: { - auto for_stmt = std::static_pointer_cast(stmt); - result.insert(for_stmt->loop_var_->name_hint_); - for (const auto& ia : for_stmt->iter_args_) result.insert(ia->name_hint_); - for (const auto& rv : for_stmt->return_vars_) result.insert(rv->name_hint_); - CollectDeclaredNames(for_stmt->body_, result); - break; - } - case ObjectKind::WhileStmt: { - auto while_stmt = std::static_pointer_cast(stmt); - for (const auto& ia : while_stmt->iter_args_) result.insert(ia->name_hint_); - for (const auto& rv : while_stmt->return_vars_) result.insert(rv->name_hint_); - CollectDeclaredNames(while_stmt->body_, result); - break; - } - case ObjectKind::IfStmt: { - auto if_stmt = std::static_pointer_cast(stmt); - for (const auto& rv : if_stmt->return_vars_) result.insert(rv->name_hint_); - CollectDeclaredNames(if_stmt->then_body_, 
result); - if (if_stmt->else_body_.has_value()) { - CollectDeclaredNames(*if_stmt->else_body_, result); - } - break; - } - case ObjectKind::SeqStmts: { - auto seq = std::static_pointer_cast(stmt); - for (const auto& s : seq->stmts_) { - CollectDeclaredNames(s, result); - } - break; - } - case ObjectKind::InCoreScopeStmt: - case ObjectKind::AutoInCoreScopeStmt: - case ObjectKind::ClusterScopeStmt: - case ObjectKind::HierarchyScopeStmt: - case ObjectKind::SpmdScopeStmt: { - auto scope = std::static_pointer_cast(stmt); - CollectDeclaredNames(scope->body_, result); - break; - } - default: - break; - } -} - -/** - * @brief Convert a vector of statements into a single StmtPtr. - * - * Returns an empty SeqStmts for empty input, the single statement for - * size==1, or a SeqStmts wrapping multiple statements. - */ -static StmtPtr MakeResultStmt(const std::vector& stmts, const Span& span) { - return SeqStmts::Flatten(std::vector(stmts), span); -} - -/** - * @brief Mutator that splits ForStmt nodes with chunk_config_ into nested loops. - * - * Runs after SSA conversion. Propagates iter_args through generated loops. - * Handles both compile-time constant and dynamic (runtime) loop bounds. - * - * Transforms (SSA form): - * for i, (x_iter=x_0,) in range(start, stop, step, chunk=C) -> (x_rv,): - * x_1 = add(x_iter, 1.0) - * yield(x_1) - * - * Into: - * for i_out, (x_outer=x_0,) in range(0, n_full) -> (x_outer_rv,): - * for i_in, (x_inner=x_outer,) in range(0, C) -> (x_inner_rv,): - * x_1 = add(x_inner, 1.0) - * yield(x_1) - * yield(x_inner_rv) - * # optional remainder - * for i_rem, (x_rem=x_outer_rv,) in range(0, n_rem) -> (x_rem_rv,): - * x_1_f = add(x_rem, 1.0) (fresh DEF variable) - * yield(x_1_f) - * return uses x_rem_rv (or x_outer_rv if no remainder) - * - * Where n_full and n_rem are ExprPtr — either ConstInt (when bounds are - * compile-time constants) or FloorDiv/FloorMod expressions (when dynamic). 
- */ -class ChunkedLoopSplitter : public IRMutator { - public: - void SeedUsedNames(const FunctionPtr& func) { - function_used_names_.clear(); - for (const auto& param : func->params_) { - if (param) { - function_used_names_.insert(param->name_hint_); - } - } - CollectDeclaredNames(func->body_, function_used_names_); - } - - StmtPtr VisitStmt_(const AutoInCoreScopeStmtPtr& op) override { - bool prev = inside_auto_incore_; - inside_auto_incore_ = true; - auto new_body = VisitStmt(op->body_); - inside_auto_incore_ = prev; - if (new_body.get() == op->body_.get()) { - return op; - } - auto new_scope = MutableCopy(op); - new_scope->body_ = std::move(new_body); - return new_scope; - } - - ExprPtr VisitExpr_(const VarPtr& op) override { - auto sub_it = substitution_map_.find(op.get()); - if (sub_it != substitution_map_.end()) { - return sub_it->second; - } - return op; - } - - ExprPtr VisitExpr_(const IterArgPtr& op) override { - auto sub_it = substitution_map_.find(op.get()); - if (sub_it != substitution_map_.end()) { - return sub_it->second; - } - return IRMutator::VisitExpr_(op); - } - - StmtPtr VisitStmt_(const ForStmtPtr& op) override { - if (!op->chunk_config_.has_value() || !inside_auto_incore_) { - return IRMutator::VisitStmt_(op); - } - - // chunk_size and step must always be compile-time constants - int64_t chunk_size = GetConstIntValue(op->chunk_config_->size, "chunk_size"); - int64_t step = GetConstIntValue(op->step_, "step"); - CHECK(step != 0) << "Chunked loop step cannot be zero"; - CHECK(chunk_size > 0) << "Chunk size must be positive, got " << chunk_size; - - Span sp = op->span_; - auto step_expr = MakeConstIndex(step, sp); - auto chunk_expr = MakeConstIndex(chunk_size, sp); - - ExprPtr start_expr = VisitExpr(op->start_); - ExprPtr stop_expr = VisitExpr(op->stop_); - - const Var* loop_var_key = op->loop_var_.get(); - auto loop_name = auto_name::Parse(op->loop_var_->name_hint_); - std::string base_name = loop_name.base_name; - - auto prev_loop_sub = 
SaveSubstitution(loop_var_key); - std::vector prev_ia_subs; - for (const auto& ia : op->iter_args_) { - prev_ia_subs.push_back(SaveSubstitution(ia.get())); - } - - bool has_iter_args = !op->iter_args_.empty(); - ChunkPolicy policy = op->chunk_config_->policy; - - if (policy == ChunkPolicy::LeadingFull) { - // Compute n_full and n_rem as ExprPtr. - ExprPtr n_full; - ExprPtr n_rem; - auto start_c = TryGetConstInt(start_expr); - auto stop_c = TryGetConstInt(stop_expr); - if (start_c && stop_c) { - int64_t tc = ComputeStaticTripCount(*start_c, *stop_c, step); - n_full = MakeConstIndex(tc / chunk_size, sp); - n_rem = MakeConstIndex(tc % chunk_size, sp); - } else { - ExprPtr trip_count = BuildTripCountExpr(start_expr, stop_expr, step, sp); - n_full = MakeFloorDiv(trip_count, chunk_expr, sp); - n_rem = MakeFloorMod(trip_count, chunk_expr, sp); - } - - auto n_full_c = TryGetConstInt(n_full); - auto n_rem_c = TryGetConstInt(n_rem); - bool emit_full = !n_full_c || *n_full_c > 0; - bool emit_rem = !n_rem_c || *n_rem_c > 0; - - if (!has_iter_args) { - return SplitLeadingFull(op, loop_var_key, base_name, loop_name.version, start_expr, step_expr, - chunk_expr, n_full, n_rem, emit_full, emit_rem, prev_loop_sub, sp); - } - - // Zero-trip optimization: when statically known, skip loop emission entirely - if (n_full_c && n_rem_c && *n_full_c == 0 && *n_rem_c == 0) { - INTERNAL_CHECK_SPAN(op->return_vars_.size() == op->iter_args_.size(), op->span_) - << "ForStmt return_vars/iter_args size mismatch in zero-trip chunk split"; - for (size_t i = 0; i < op->return_vars_.size(); ++i) { - substitution_map_[op->return_vars_[i].get()] = VisitExpr(op->iter_args_[i]->initValue_); - } - RestoreSubstitution(prev_loop_sub); - RestoreSubstitutions(prev_ia_subs); - return SeqStmts::Flatten(std::vector{}, sp); - } - - return SplitLeadingFullWithIterArgs(op, loop_var_key, base_name, loop_name.version, start_expr, - step_expr, chunk_expr, n_full, n_rem, emit_full, emit_rem, - prev_loop_sub, 
prev_ia_subs, sp); - } - - INTERNAL_CHECK_SPAN(policy == ChunkPolicy::Guarded, op->span_) - << "Unexpected ChunkPolicy in SplitChunkedLoops: " << ChunkPolicyToString(policy); - - // Compute n_total = ceil(trip_count / chunk_size). - ExprPtr n_total; - auto start_c = TryGetConstInt(start_expr); - auto stop_c = TryGetConstInt(stop_expr); - if (start_c && stop_c) { - int64_t tc = ComputeStaticTripCount(*start_c, *stop_c, step); - int64_t nt = (tc + chunk_size - 1) / chunk_size; - n_total = MakeConstIndex(nt, sp); - } else { - ExprPtr trip_count = BuildTripCountExpr(start_expr, stop_expr, step, sp); - ExprPtr numerator = MakeAdd(trip_count, MakeConstIndex(chunk_size - 1, sp), sp); - n_total = MakeFloorDiv(numerator, chunk_expr, sp); - } - - auto n_total_c = TryGetConstInt(n_total); - bool emit = !n_total_c || *n_total_c > 0; - - if (!emit) { - // Statically zero iterations: emit nothing and forward iter_arg initial values. - if (has_iter_args) { - INTERNAL_CHECK_SPAN(op->return_vars_.size() == op->iter_args_.size(), op->span_) - << "ForStmt return_vars/iter_args size mismatch in zero-trip guarded chunk split"; - for (size_t i = 0; i < op->return_vars_.size(); ++i) { - substitution_map_[op->return_vars_[i].get()] = VisitExpr(op->iter_args_[i]->initValue_); - } - } - RestoreSubstitution(prev_loop_sub); - RestoreSubstitutions(prev_ia_subs); - return SeqStmts::Flatten(std::vector{}, sp); - } - - if (!has_iter_args) { - return SplitGuarded(op, loop_var_key, base_name, loop_name.version, start_expr, step_expr, step, - chunk_expr, stop_expr, n_total, prev_loop_sub, sp); - } - return SplitGuardedWithIterArgs(op, loop_var_key, base_name, loop_name.version, start_expr, step_expr, - step, chunk_expr, stop_expr, n_total, prev_loop_sub, prev_ia_subs, sp); - } - - StmtPtr VisitStmt_(const SeqStmtsPtr& op) override { - std::vector new_stmts; - bool changed = false; - - for (const auto& stmt : op->stmts_) { - auto new_stmt = VisitStmt(stmt); - if (new_stmt.get() != stmt.get()) { - 
changed = true; - } - // Flatten nested SeqStmts - auto seq = std::dynamic_pointer_cast(new_stmt); - if (seq) { - for (const auto& inner : seq->stmts_) { - new_stmts.push_back(inner); - } - } else { - new_stmts.push_back(new_stmt); - } - } - - if (!changed) { - return op; - } - return SeqStmts::Flatten(std::move(new_stmts), op->span_); - } - - private: - bool inside_auto_incore_ = false; - std::unordered_set function_used_names_; - std::unordered_map substitution_map_; - - using SavedSubstitution = std::pair; - - SavedSubstitution SaveSubstitution(const Var* key) { - auto it = substitution_map_.find(key); - return {key, (it != substitution_map_.end()) ? it->second : nullptr}; - } - - void RestoreSubstitution(const SavedSubstitution& saved) { - if (saved.second) { - substitution_map_[saved.first] = saved.second; - } else { - substitution_map_.erase(saved.first); - } - } - - void RestoreSubstitutions(const std::vector& saved) { - for (const auto& entry : saved) { - RestoreSubstitution(entry); - } - } - - /** - * @brief Freshen all DEF vars in the body to preserve SSA uniqueness. - * - * Used when the body is visited more than once (e.g. full-chunk + remainder). - * Returns saved substitutions that must be restored after visiting the body. - */ - std::vector FreshenBodyDefVars(const StmtPtr& body) { - std::vector prev_def_subs; - std::vector body_def_vars; - CollectDefVars(body, body_def_vars); - for (const auto& var : body_def_vars) { - prev_def_subs.push_back(SaveSubstitution(var.get())); - auto fresh_name = auto_name::GenerateFreshNameLike(var->name_hint_, function_used_names_); - function_used_names_.insert(fresh_name); - auto fresh = std::make_shared(fresh_name, var->GetType(), var->span_); - substitution_map_[var.get()] = fresh; - } - return prev_def_subs; - } - - /** - * @brief Split a chunked loop without iter_args. - * - * n_full and n_rem are ExprPtr — either ConstInt or dynamic expressions. 
- */ - StmtPtr SplitLeadingFull(const ForStmtPtr& op, const Var* loop_var_key, const std::string& base_name, - const std::optional& loop_version, const ExprPtr& start_expr, - const ExprPtr& step_expr, const ExprPtr& chunk_expr, const ExprPtr& n_full, - const ExprPtr& n_rem, bool emit_full, bool emit_rem, - const SavedSubstitution& prev_loop_sub, const Span& sp) { - auto zero = MakeConstIndex(0, sp); - auto one = MakeConstIndex(1, sp); - std::vector result_stmts; - - if (emit_full) { - auto out_var = std::make_shared( - auto_name::BuildName(base_name, auto_name::ChunkOuterQualifier(), "idx", loop_version), - std::make_shared(DataType::INDEX), sp); - auto in_var = std::make_shared( - auto_name::BuildName(base_name, auto_name::ChunkInnerQualifier(), "idx", loop_version), - std::make_shared(DataType::INDEX), sp); - - // i = start + (i_out * C + i_in) * step - substitution_map_[loop_var_key] = - MakeAdd(start_expr, MakeMul(MakeAdd(MakeMul(out_var, chunk_expr), in_var), step_expr)); - auto inner_body = VisitStmt(op->body_); - - auto inner_for = std::make_shared( - in_var, zero, chunk_expr, one, std::vector{}, inner_body, std::vector{}, sp, - op->kind_, std::nullopt, MakeLoopAttrs(op->attrs_, LoopOrigin::ChunkInner)); - auto outer_for = std::make_shared( - out_var, zero, n_full, one, std::vector{}, inner_for, std::vector{}, sp, - op->kind_, std::nullopt, MakeLoopAttrs(op->attrs_, LoopOrigin::ChunkOuter)); - result_stmts.push_back(outer_for); - } - - if (emit_rem) { - auto rem_var = std::make_shared( - auto_name::BuildName(base_name, auto_name::ChunkRemainderQualifier(), "idx", loop_version), - std::make_shared(DataType::INDEX), sp); - - // i = start + (n_full * C + i_rem) * step - substitution_map_[loop_var_key] = - MakeAdd(start_expr, MakeMul(MakeAdd(MakeMul(n_full, chunk_expr), rem_var), step_expr)); - - std::vector prev_def_subs; - if (emit_full) { - prev_def_subs = FreshenBodyDefVars(op->body_); - } - auto rem_body = VisitStmt(op->body_); - 
RestoreSubstitutions(prev_def_subs); - - auto rem_for = std::make_shared(rem_var, zero, n_rem, one, std::vector{}, rem_body, - std::vector{}, sp, op->kind_, std::nullopt, - MakeLoopAttrs(op->attrs_, LoopOrigin::ChunkRemainder)); - result_stmts.push_back(rem_for); - } - - RestoreSubstitution(prev_loop_sub); - return MakeResultStmt(result_stmts, sp); - } - - /** - * @brief Split a chunked loop with iter_args (SSA propagation). - * - * n_full and n_rem are ExprPtr — either ConstInt or dynamic expressions. - */ - StmtPtr SplitLeadingFullWithIterArgs(const ForStmtPtr& op, const Var* loop_var_key, - const std::string& base_name, const std::optional& loop_version, - const ExprPtr& start_expr, const ExprPtr& step_expr, - const ExprPtr& chunk_expr, const ExprPtr& n_full, const ExprPtr& n_rem, - bool emit_full, bool emit_rem, const SavedSubstitution& prev_loop_sub, - const std::vector& prev_ia_subs, const Span& sp) { - auto zero = MakeConstIndex(0, sp); - auto one = MakeConstIndex(1, sp); - std::vector result_stmts; - std::vector final_return_vars; - - if (emit_full) { - auto out_var = std::make_shared( - auto_name::BuildName(base_name, auto_name::ChunkOuterQualifier(), "idx", loop_version), - std::make_shared(DataType::INDEX), sp); - auto in_var = std::make_shared( - auto_name::BuildName(base_name, auto_name::ChunkInnerQualifier(), "idx", loop_version), - std::make_shared(DataType::INDEX), sp); - - std::vector outer_iter_args; - std::vector outer_return_vars; - std::vector inner_iter_args; - std::vector inner_return_vars; - - for (const auto& ia : op->iter_args_) { - auto visited_init = VisitExpr(ia->initValue_); - auto ia_name = auto_name::Parse(ia->name_hint_); - auto outer_ia = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkOuterQualifier(), "iter", - ia_name.version), - ia->GetType(), visited_init, ia->span_); - auto outer_rv = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkOuterQualifier(), "rv", 
ia_name.version), - ia->GetType(), ia->span_); - outer_iter_args.push_back(outer_ia); - outer_return_vars.push_back(outer_rv); - - auto inner_ia = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkInnerQualifier(), "iter", - ia_name.version), - ia->GetType(), ExprPtr(outer_ia), ia->span_); - auto inner_rv = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkInnerQualifier(), "rv", ia_name.version), - ia->GetType(), ia->span_); - inner_iter_args.push_back(inner_ia); - inner_return_vars.push_back(inner_rv); - - substitution_map_[ia.get()] = inner_ia; - } - - // i = start + (i_out * C + i_in) * step - substitution_map_[loop_var_key] = - MakeAdd(start_expr, MakeMul(MakeAdd(MakeMul(out_var, chunk_expr), in_var), step_expr)); - auto inner_body = VisitStmt(op->body_); - - auto inner_for = std::make_shared(in_var, zero, chunk_expr, one, inner_iter_args, inner_body, - inner_return_vars, sp, op->kind_, std::nullopt, - MakeLoopAttrs(op->attrs_, LoopOrigin::ChunkInner)); - auto outer_yield = std::make_shared( - std::vector(inner_return_vars.begin(), inner_return_vars.end()), sp); - auto outer_body = SeqStmts::Flatten(std::vector{inner_for, outer_yield}, sp); - - auto outer_for = std::make_shared(out_var, zero, n_full, one, outer_iter_args, outer_body, - outer_return_vars, sp, op->kind_, std::nullopt, - MakeLoopAttrs(op->attrs_, LoopOrigin::ChunkOuter)); - - result_stmts.push_back(outer_for); - final_return_vars = outer_return_vars; - } - - if (emit_rem) { - auto rem_var = std::make_shared( - auto_name::BuildName(base_name, auto_name::ChunkRemainderQualifier(), "idx", loop_version), - std::make_shared(DataType::INDEX), sp); - - std::vector rem_iter_args; - std::vector rem_return_vars; - - for (size_t i = 0; i < op->iter_args_.size(); ++i) { - const auto& ia = op->iter_args_[i]; - ExprPtr rem_init = emit_full ? 
ExprPtr(final_return_vars[i]) : VisitExpr(ia->initValue_); - auto ia_name = auto_name::Parse(ia->name_hint_); - auto rem_ia = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkRemainderQualifier(), "iter", - ia_name.version), - ia->GetType(), rem_init, ia->span_); - auto rem_rv = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkRemainderQualifier(), "rv", - ia_name.version), - ia->GetType(), ia->span_); - rem_iter_args.push_back(rem_ia); - rem_return_vars.push_back(rem_rv); - - substitution_map_[ia.get()] = rem_ia; - } - - // i = start + (n_full * C + i_rem) * step - substitution_map_[loop_var_key] = - MakeAdd(start_expr, MakeMul(MakeAdd(MakeMul(n_full, chunk_expr), rem_var), step_expr)); - - std::vector prev_def_subs; - if (emit_full) { - prev_def_subs = FreshenBodyDefVars(op->body_); - } - auto rem_body = VisitStmt(op->body_); - RestoreSubstitutions(prev_def_subs); - - auto rem_for = std::make_shared(rem_var, zero, n_rem, one, rem_iter_args, rem_body, - rem_return_vars, sp, op->kind_, std::nullopt, - MakeLoopAttrs(op->attrs_, LoopOrigin::ChunkRemainder)); - - result_stmts.push_back(rem_for); - final_return_vars = rem_return_vars; - } - - INTERNAL_CHECK_SPAN(op->return_vars_.size() == final_return_vars.size(), op->span_) - << "SplitChunkedLoops produced mismatched return vars"; - for (size_t i = 0; i < op->return_vars_.size(); ++i) { - substitution_map_[op->return_vars_[i].get()] = final_return_vars[i]; - } - - RestoreSubstitution(prev_loop_sub); - RestoreSubstitutions(prev_ia_subs); - - return MakeResultStmt(result_stmts, sp); - } - - /** - * @brief Guarded split without iter_args. - * - * Emits a single outer loop over ceil(trip_count / C) chunks and an inner loop - * of size C, with the body wrapped in `if (idx < stop)` so out-of-range - * iterations become no-ops. This preserves a single-kernel outline for dynamic - * bounds and loops with cross-iteration state. 
- */ - StmtPtr SplitGuarded(const ForStmtPtr& op, const Var* loop_var_key, const std::string& base_name, - const std::optional& loop_version, const ExprPtr& start_expr, - const ExprPtr& step_expr, int64_t step, const ExprPtr& chunk_expr, - const ExprPtr& stop_expr, const ExprPtr& n_total, - const SavedSubstitution& prev_loop_sub, const Span& sp) { - auto zero = MakeConstIndex(0, sp); - auto one = MakeConstIndex(1, sp); - - auto out_var = std::make_shared( - auto_name::BuildName(base_name, auto_name::ChunkOuterQualifier(), "idx", loop_version), - std::make_shared(DataType::INDEX), sp); - auto in_var = std::make_shared( - auto_name::BuildName(base_name, auto_name::ChunkInnerQualifier(), "idx", loop_version), - std::make_shared(DataType::INDEX), sp); - - // idx = start + (out_var * C + in_var) * step - ExprPtr idx_expr = MakeAdd( - start_expr, MakeMul(MakeAdd(MakeMul(out_var, chunk_expr, sp), in_var, sp), step_expr, sp), sp); - substitution_map_[loop_var_key] = idx_expr; - auto visited_body = VisitStmt(op->body_); - - // Guard: for step > 0 use `idx < stop`, for step < 0 use `idx > stop`. - auto cond = step > 0 ? MakeLt(idx_expr, stop_expr, sp) : MakeGt(idx_expr, stop_expr, sp); - auto if_stmt = - std::make_shared(cond, visited_body, std::optional{}, std::vector{}, sp); - - auto inner_for = std::make_shared(in_var, zero, chunk_expr, one, std::vector{}, - if_stmt, std::vector{}, sp, op->kind_, std::nullopt, - MakeLoopAttrs(op->attrs_, LoopOrigin::ChunkInner)); - auto outer_for = std::make_shared(out_var, zero, n_total, one, std::vector{}, - inner_for, std::vector{}, sp, op->kind_, std::nullopt, - MakeLoopAttrs(op->attrs_, LoopOrigin::ChunkOuter)); - - RestoreSubstitution(prev_loop_sub); - return outer_for; - } - - /** - * @brief Guarded split with iter_args (SSA propagation through IfStmt phi). - * - * Wraps the body in an IfStmt whose return_vars act as phi nodes. 
The then - * branch ends with the user body's own YieldStmt; the else branch yields the - * unchanged inner iter_args. The inner loop's trailing YieldStmt references - * the IfStmt's phi return_vars, threading loop-carried state through both - * guarded and skipped iterations. - */ - StmtPtr SplitGuardedWithIterArgs(const ForStmtPtr& op, const Var* loop_var_key, - const std::string& base_name, const std::optional& loop_version, - const ExprPtr& start_expr, const ExprPtr& step_expr, int64_t step, - const ExprPtr& chunk_expr, const ExprPtr& stop_expr, - const ExprPtr& n_total, const SavedSubstitution& prev_loop_sub, - const std::vector& prev_ia_subs, const Span& sp) { - auto zero = MakeConstIndex(0, sp); - auto one = MakeConstIndex(1, sp); - - auto out_var = std::make_shared( - auto_name::BuildName(base_name, auto_name::ChunkOuterQualifier(), "idx", loop_version), - std::make_shared(DataType::INDEX), sp); - auto in_var = std::make_shared( - auto_name::BuildName(base_name, auto_name::ChunkInnerQualifier(), "idx", loop_version), - std::make_shared(DataType::INDEX), sp); - - std::vector outer_iter_args; - std::vector outer_return_vars; - std::vector inner_iter_args; - std::vector inner_return_vars; - std::vector if_return_vars; - - for (const auto& ia : op->iter_args_) { - auto visited_init = VisitExpr(ia->initValue_); - auto ia_name = auto_name::Parse(ia->name_hint_); - auto outer_ia = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkOuterQualifier(), "iter", ia_name.version), - ia->GetType(), visited_init, ia->span_); - auto outer_rv = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkOuterQualifier(), "rv", ia_name.version), - ia->GetType(), ia->span_); - outer_iter_args.push_back(outer_ia); - outer_return_vars.push_back(outer_rv); - - auto inner_ia = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkInnerQualifier(), "iter", ia_name.version), - ia->GetType(), ExprPtr(outer_ia), 
ia->span_); - auto inner_rv = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkInnerQualifier(), "rv", ia_name.version), - ia->GetType(), ia->span_); - inner_iter_args.push_back(inner_ia); - inner_return_vars.push_back(inner_rv); - - auto if_rv = std::make_shared( - auto_name::BuildName(ia_name.base_name, auto_name::ChunkGuardQualifier(), "rv", ia_name.version), - ia->GetType(), ia->span_); - if_return_vars.push_back(if_rv); - - substitution_map_[ia.get()] = inner_ia; - } - - // idx = start + (out_var * C + in_var) * step - ExprPtr idx_expr = MakeAdd( - start_expr, MakeMul(MakeAdd(MakeMul(out_var, chunk_expr, sp), in_var, sp), step_expr, sp), sp); - substitution_map_[loop_var_key] = idx_expr; - auto visited_body = VisitStmt(op->body_); - - // Else branch: pass through current inner iter_args unchanged. - std::vector else_yield_values(inner_iter_args.begin(), inner_iter_args.end()); - auto else_yield = std::make_shared(std::move(else_yield_values), sp); - - // Guarded IfStmt with phi return_vars. - // For step > 0 use `idx < stop`, for step < 0 use `idx > stop`. - auto cond = step > 0 ? 
MakeLt(idx_expr, stop_expr, sp) : MakeGt(idx_expr, stop_expr, sp); - auto if_stmt = - std::make_shared(cond, visited_body, std::optional{else_yield}, if_return_vars, sp); - - // Inner loop body: SeqStmts { IfStmt, YieldStmt(if_return_vars) } - auto inner_trailing_yield = - std::make_shared(std::vector(if_return_vars.begin(), if_return_vars.end()), sp); - auto inner_body = SeqStmts::Flatten(std::vector{if_stmt, inner_trailing_yield}, sp); - - auto inner_for = std::make_shared(in_var, zero, chunk_expr, one, inner_iter_args, inner_body, - inner_return_vars, sp, op->kind_, std::nullopt, - MakeLoopAttrs(op->attrs_, LoopOrigin::ChunkInner)); - - // Outer loop body: SeqStmts { inner_for, YieldStmt(inner_return_vars) } - auto outer_yield = std::make_shared( - std::vector(inner_return_vars.begin(), inner_return_vars.end()), sp); - auto outer_body = SeqStmts::Flatten(std::vector{inner_for, outer_yield}, sp); - - auto outer_for = std::make_shared(out_var, zero, n_total, one, outer_iter_args, outer_body, - outer_return_vars, sp, op->kind_, std::nullopt, - MakeLoopAttrs(op->attrs_, LoopOrigin::ChunkOuter)); - - INTERNAL_CHECK_SPAN(op->return_vars_.size() == outer_return_vars.size(), op->span_) - << "SplitChunkedLoops guarded produced mismatched return vars"; - for (size_t i = 0; i < op->return_vars_.size(); ++i) { - substitution_map_[op->return_vars_[i].get()] = outer_return_vars[i]; - } - - RestoreSubstitution(prev_loop_sub); - RestoreSubstitutions(prev_ia_subs); - return outer_for; - } -}; - -/** - * @brief Transform a function by splitting chunked loops. 
- */ -FunctionPtr TransformSplitChunkedLoops(const FunctionPtr& func) { - INTERNAL_CHECK(func) << "SplitChunkedLoops cannot run on null function"; - - ChunkedLoopSplitter splitter; - splitter.SeedUsedNames(func); - auto new_body = splitter.VisitStmt(func->body_); - - if (new_body.get() == func->body_.get()) { - return func; - } - - auto new_func = MutableCopy(func); - new_func->body_ = std::move(new_body); - return new_func; -} - -} // namespace - -// Factory function -namespace pass { -Pass SplitChunkedLoops() { - return CreateFunctionPass(TransformSplitChunkedLoops, "SplitChunkedLoops", kSplitChunkedLoopsProperties); -} -} // namespace pass - -} // namespace ir -} // namespace pypto diff --git a/src/ir/transforms/structural_equal.cpp b/src/ir/transforms/structural_equal.cpp index 612045582..bc2aa3ada 100644 --- a/src/ir/transforms/structural_equal.cpp +++ b/src/ir/transforms/structural_equal.cpp @@ -876,8 +876,6 @@ bool StructuralEqualImpl::Equal(const IRNodePtr& lhs, const IRNodePt EQUAL_DISPATCH(ReturnStmt) EQUAL_DISPATCH(ForStmt) EQUAL_DISPATCH(WhileStmt) - EQUAL_DISPATCH(InCoreScopeStmt) - EQUAL_DISPATCH(AutoInCoreScopeStmt) EQUAL_DISPATCH(ClusterScopeStmt) EQUAL_DISPATCH(HierarchyScopeStmt) EQUAL_DISPATCH(SpmdScopeStmt) diff --git a/src/ir/transforms/structural_hash.cpp b/src/ir/transforms/structural_hash.cpp index 7ef4e05bc..22d20ef32 100644 --- a/src/ir/transforms/structural_hash.cpp +++ b/src/ir/transforms/structural_hash.cpp @@ -552,8 +552,6 @@ StructuralHasher::result_type StructuralHasher::HashNode(const IRNodePtr& node) HASH_DISPATCH(ReturnStmt) HASH_DISPATCH(ForStmt) HASH_DISPATCH(WhileStmt) - HASH_DISPATCH(InCoreScopeStmt) - HASH_DISPATCH(AutoInCoreScopeStmt) HASH_DISPATCH(ClusterScopeStmt) HASH_DISPATCH(HierarchyScopeStmt) HASH_DISPATCH(SpmdScopeStmt) diff --git a/src/ir/transforms/utils/transform_utils.cpp b/src/ir/transforms/utils/transform_utils.cpp index 762bc18b8..0684b044d 100644 --- a/src/ir/transforms/utils/transform_utils.cpp +++ 
b/src/ir/transforms/utils/transform_utils.cpp @@ -255,8 +255,6 @@ void CollectDefVars(const StmtPtr& stmt, std::vector& result) { } break; } - case ObjectKind::InCoreScopeStmt: - case ObjectKind::AutoInCoreScopeStmt: case ObjectKind::ClusterScopeStmt: case ObjectKind::HierarchyScopeStmt: case ObjectKind::SpmdScopeStmt: { diff --git a/src/ir/transforms/visitor.cpp b/src/ir/transforms/visitor.cpp index 19752b253..87843057e 100644 --- a/src/ir/transforms/visitor.cpp +++ b/src/ir/transforms/visitor.cpp @@ -213,16 +213,6 @@ void IRVisitor::VisitStmt_(const WhileStmtPtr& op) { } } -void IRVisitor::VisitStmt_(const InCoreScopeStmtPtr& op) { - INTERNAL_CHECK_SPAN(op->body_, op->span_) << "InCoreScopeStmt has null body"; - VisitStmt(op->body_); -} - -void IRVisitor::VisitStmt_(const AutoInCoreScopeStmtPtr& op) { - INTERNAL_CHECK_SPAN(op->body_, op->span_) << "AutoInCoreScopeStmt has null body"; - VisitStmt(op->body_); -} - void IRVisitor::VisitStmt_(const ClusterScopeStmtPtr& op) { INTERNAL_CHECK_SPAN(op->body_, op->span_) << "ClusterScopeStmt has null body"; VisitStmt(op->body_); diff --git a/src/ir/verifier/property_verifier_registry.cpp b/src/ir/verifier/property_verifier_registry.cpp index 6553e104d..606787fd4 100644 --- a/src/ir/verifier/property_verifier_registry.cpp +++ b/src/ir/verifier/property_verifier_registry.cpp @@ -44,7 +44,6 @@ PropertyVerifierRegistry::PropertyVerifierRegistry() { Register(IRProperty::NoNestedCalls, CreateNoNestedCallPropertyVerifier); Register(IRProperty::NormalizedStmtStructure, CreateNormalizedStmtPropertyVerifier); Register(IRProperty::NoRedundantBlocks, CreateNoRedundantBlocksPropertyVerifier); - Register(IRProperty::SplitIncoreOrch, CreateSplitIncoreOrchPropertyVerifier); Register(IRProperty::ClusterOutlined, CreateClusterOutlinedPropertyVerifier); Register(IRProperty::HierarchyOutlined, CreateHierarchyOutlinedPropertyVerifier); Register(IRProperty::HasMemRefs, CreateHasMemRefsPropertyVerifier); @@ -57,7 +56,6 @@ 
PropertyVerifierRegistry::PropertyVerifierRegistry() { Register(IRProperty::UseAfterDef, CreateUseAfterDefPropertyVerifier); Register(IRProperty::StructuredCtrlFlow, CreateStructuredCtrlFlowPropertyVerifier); Register(IRProperty::OutParamNotShadowed, CreateOutParamNotShadowedPropertyVerifier); - Register(IRProperty::NoNestedInCore, CreateNoNestedIncorePropertyVerifier); Register(IRProperty::InOutUseValid, CreateInOutUseValidPropertyVerifier); } diff --git a/tests/ut/codegen/test_orchestration_codegen.py b/tests/ut/codegen/test_orchestration_codegen.py index e724fdc94..7778162ba 100644 --- a/tests/ut/codegen/test_orchestration_codegen.py +++ b/tests/ut/codegen/test_orchestration_codegen.py @@ -1152,52 +1152,6 @@ def orch( assert "acc__loop_state" not in code assert "params_t1.add_input(acc);" in code - def test_for_loop_with_inplace_return_after_passes(self): - """Test inplace detection when return var has compound auto-name suffixes from pass pipeline. - - When an Opaque function with auto_incore + parallel(chunk=) goes through the full - pass pipeline (SSA → split_chunked_loops → interchange_chunk_loops → outline), the - return var acquires compound suffixes like "__co_l0_rv_v1". GetSSABaseName must - strip all of these to match the return var back to the original param name for correct - inplace detection (2 arg slots, not 3). 
- """ - backend.reset_for_testing() - backend.set_backend_type(BackendType.Ascend910B) - - @pl.program - class ChunkedInplaceProgram: - @pl.function(type=pl.FunctionType.Opaque) - def add_one( - self, - input_tensor: pl.Tensor[[1024, 256], pl.FP32], - output_tensor: pl.Tensor[[1024, 256], pl.FP32], - ) -> pl.Tensor[[1024, 256], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for r in pl.parallel(0, 1024, 1, chunk=64, chunk_policy="leading_full"): - row_tile = pl.slice(input_tensor, [1, 256], [r, 0]) - row_result = pl.add(row_tile, 1.0) - output_tensor = pl.assemble(output_tensor, row_result, [r, 0]) - return output_tensor - - # Run the full pass pipeline to produce compound SSA suffixes - pm = PassManager.get_strategy(OptimizationStrategy.Default) - transformed = pm.run_passes(ChunkedInplaceProgram) - - code = _generate_orch_code(transformed) - - # Inplace detection: output_tensor return var should match the param, - # so only 2 orch arg slots (input_tensor + output_tensor), not 3 - assert "expected_arg_count = 2" in code - assert "from_tensor_arg(orch_args.tensor(0))" in code # input_tensor - assert "from_tensor_arg(orch_args.tensor(1))" in code # output_tensor - - # No third orch entry for the compound-named return var - assert "orch_args.tensor(2)" not in code - - # Task params should use ext_output_tensor (the inplace param), not a separate buffer - assert "ext_output_tensor)" in code - assert "ext_output_tensor_iter" not in code - def test_tensor_assemble_uses_precomputed_view(self): """tensor.assemble should lower to a pre-generated target view, not a host copy.""" diff --git a/tests/ut/codegen/test_pto_codegen_cross_core.py b/tests/ut/codegen/test_pto_codegen_cross_core.py index 05a539f5b..976f4e960 100644 --- a/tests/ut/codegen/test_pto_codegen_cross_core.py +++ b/tests/ut/codegen/test_pto_codegen_cross_core.py @@ -217,9 +217,7 @@ def _compile_and_generate(program) -> dict[str, str]: passes.unroll_loops, 
passes.convert_to_ssa, passes.flatten_call_expr, - passes.split_chunked_loops, - passes.interchange_chunk_loops, - passes.outline_incore_scopes, + passes.outline_hierarchy_scopes, passes.outline_cluster_scopes, passes.convert_tensor_to_tile_ops, passes.flatten_tile_nd_to_2d, @@ -511,9 +509,7 @@ def _expand_and_generate(program) -> dict[str, str]: pipeline.add_pass(passes.unroll_loops()) pipeline.add_pass(passes.convert_to_ssa()) pipeline.add_pass(passes.flatten_call_expr()) - pipeline.add_pass(passes.split_chunked_loops()) - pipeline.add_pass(passes.interchange_chunk_loops()) - pipeline.add_pass(passes.outline_incore_scopes()) + pipeline.add_pass(passes.outline_hierarchy_scopes()) pipeline.add_pass(passes.outline_cluster_scopes()) pipeline.add_pass(passes.convert_tensor_to_tile_ops()) pipeline.add_pass(passes.flatten_tile_nd_to_2d()) diff --git a/tests/ut/debug/test_torch_codegen.py b/tests/ut/debug/test_torch_codegen.py index 9aec50c90..dfc3d587d 100644 --- a/tests/ut/debug/test_torch_codegen.py +++ b/tests/ut/debug/test_torch_codegen.py @@ -429,7 +429,7 @@ def test_scope_is_transparent(): b = _tensor_var("b", [64]) call = _op_call("tensor.neg", [a]) assign = ir.AssignStmt(b, call, _span()) - scope = ir.InCoreScopeStmt(body=assign, span=_span()) + scope = ir.HierarchyScopeStmt(level=ir.Level.CORE_GROUP, body=assign, span=_span()) func = _simple_function("f", [a], scope) code = torch_codegen(func) assert "torch.neg(a)" in code diff --git a/tests/ut/ir/parser/test_at_optimizations.py b/tests/ut/ir/parser/test_at_optimizations.py index de5fe3234..e42389a0a 100644 --- a/tests/ut/ir/parser/test_at_optimizations.py +++ b/tests/ut/ir/parser/test_at_optimizations.py @@ -9,13 +9,11 @@ """Tests for pl.at(..., optimizations=[...]) parsing. -Covers issue #1030: the optimizations= list lets users express ``pl.split(...)`` -and ``pl.auto_chunk`` independently. 
The legacy ``optimization=`` and top-level -``split=`` kwargs remain functional but emit DeprecationWarning, and mixing the -new ``optimizations=`` with either deprecated kwarg is a hard error. +After the removal of InCoreScopeStmt / AutoInCoreScopeStmt, ``pl.at(...)`` always +produces a HierarchyScopeStmt. At ``Level.CORE_GROUP``, the ``optimizations=`` +list accepts ``pl.split(mode)`` to populate the scope's ``split`` field. """ -import warnings from typing import TypeVar import pypto.language as pl @@ -38,11 +36,11 @@ def _find_scope(stmt, scope_type: type[T]) -> T | None: return None -# ─── New API: optimizations=[pl.split(...)] → InCore with split ────────────── +# ─── optimizations=[pl.split(...)] → HierarchyScopeStmt with split ─────────── def test_parse_optimizations_split_only_up_down(): - """optimizations=[pl.split(UP_DOWN)] → InCore with split=UP_DOWN.""" + """optimizations=[pl.split(UP_DOWN)] → HierarchyScopeStmt with split=UP_DOWN.""" @pl.function def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: @@ -50,13 +48,14 @@ def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: y = pl.add(x, x) return y - scope = _find_scope(f.body, ir.InCoreScopeStmt) + scope = _find_scope(f.body, ir.HierarchyScopeStmt) assert scope is not None + assert scope.level == ir.Level.CORE_GROUP assert scope.split == ir.SplitMode.UP_DOWN def test_parse_optimizations_split_only_left_right(): - """optimizations=[pl.split(LEFT_RIGHT)] → InCore with split=LEFT_RIGHT.""" + """optimizations=[pl.split(LEFT_RIGHT)] → HierarchyScopeStmt with split=LEFT_RIGHT.""" @pl.function def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: @@ -64,81 +63,13 @@ def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: y = pl.add(x, x) return y - scope = _find_scope(f.body, ir.InCoreScopeStmt) + scope = _find_scope(f.body, ir.HierarchyScopeStmt) assert scope is not None assert scope.split == ir.SplitMode.LEFT_RIGHT -# ─── New API: optimizations=[pl.auto_chunk] → 
AutoInCore (no split) ────────── - - -def test_parse_optimizations_auto_chunk_only(): - """optimizations=[pl.auto_chunk] → AutoInCore with no split.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk]): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - scope = _find_scope(f.body, ir.AutoInCoreScopeStmt) - assert scope is not None - assert scope.split is None - - -# ─── New API: optimizations=[pl.auto_chunk, pl.split(...)] → AutoInCore + split - - -def test_parse_optimizations_auto_chunk_with_split(): - """optimizations=[pl.auto_chunk, pl.split(UP_DOWN)] → AutoInCore with split.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at( - level=pl.Level.CORE_GROUP, - optimizations=[pl.auto_chunk, pl.split(pl.SplitMode.UP_DOWN)], - ): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - scope = _find_scope(f.body, ir.AutoInCoreScopeStmt) - assert scope is not None - assert scope.split == ir.SplitMode.UP_DOWN - - -def test_parse_optimizations_order_independent(): - """List order does not affect the produced IR.""" - - @pl.function - def f1(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at( - level=pl.Level.CORE_GROUP, - optimizations=[pl.auto_chunk, pl.split(pl.SplitMode.LEFT_RIGHT)], - ): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - @pl.function - def f2(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at( - level=pl.Level.CORE_GROUP, - optimizations=[pl.split(pl.SplitMode.LEFT_RIGHT), pl.auto_chunk], - ): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - s1 = _find_scope(f1.body, ir.AutoInCoreScopeStmt) - s2 = _find_scope(f2.body, ir.AutoInCoreScopeStmt) - assert s1 
is not None and s2 is not None - assert s1.split == s2.split == ir.SplitMode.LEFT_RIGHT - - -def test_parse_optimizations_empty_list_is_plain_incore(): - """optimizations=[] → InCore with no split.""" +def test_parse_optimizations_empty_list_is_plain_hierarchy(): + """optimizations=[] → HierarchyScopeStmt with no split.""" @pl.function def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: @@ -146,138 +77,24 @@ def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: y = pl.add(x, x) return y - scope = _find_scope(f.body, ir.InCoreScopeStmt) + scope = _find_scope(f.body, ir.HierarchyScopeStmt) assert scope is not None assert scope.split is None -# ─── Equivalence with deprecated API ────────────────────────────────────────── - - -def test_legacy_chunked_loop_optimizer_matches_new_form(): - """Legacy bare optimizer (defaults to UP_DOWN) ≡ new auto_chunk + split(UP_DOWN).""" - - with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) - - @pl.function - def legacy(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x +def test_parse_core_group_no_optimizations(): + """pl.at(level=CORE_GROUP) without optimizations → plain HierarchyScopeStmt.""" @pl.function - def new(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at( - level=pl.Level.CORE_GROUP, - optimizations=[pl.auto_chunk, pl.split(pl.SplitMode.UP_DOWN)], - ): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - s_legacy = _find_scope(legacy.body, ir.AutoInCoreScopeStmt) - s_new = _find_scope(new.body, ir.AutoInCoreScopeStmt) - assert s_legacy is not None and s_new is not None - assert s_legacy.split == s_new.split == ir.SplitMode.UP_DOWN - - -def test_legacy_split_kwarg_matches_new_form(): - """Legacy top-level 
split= ≡ new optimizations=[pl.split(...)].""" - - with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) - - @pl.function - def legacy(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.LEFT_RIGHT): - y = pl.add(x, x) - return y - - @pl.function - def new(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.split(pl.SplitMode.LEFT_RIGHT)]): + def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + with pl.at(level=pl.Level.CORE_GROUP): y = pl.add(x, x) return y - s_legacy = _find_scope(legacy.body, ir.InCoreScopeStmt) - s_new = _find_scope(new.body, ir.InCoreScopeStmt) - assert s_legacy is not None and s_new is not None - assert s_legacy.split == s_new.split == ir.SplitMode.LEFT_RIGHT - - -# ─── DeprecationWarning emission ────────────────────────────────────────────── - - -def test_legacy_optimization_kwarg_emits_deprecation_warning(): - """Using the legacy optimization= kwarg emits DeprecationWarning.""" - with pytest.warns(DeprecationWarning, match="optimizations=\\[pl.auto_chunk\\]"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - -def test_legacy_split_kwarg_emits_deprecation_warning(): - """Using the legacy top-level split= kwarg emits DeprecationWarning.""" - with pytest.warns(DeprecationWarning, match="optimizations=\\[pl.split"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - y = pl.add(x, x) - return y - - -def test_new_optimizations_kwarg_emits_no_warning(): - """The new optimizations= API emits no DeprecationWarning.""" - with warnings.catch_warnings(): - 
warnings.simplefilter("error", DeprecationWarning) - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.split(pl.SplitMode.UP_DOWN)]): - y = pl.add(x, x) - return y - - -# ─── Hard errors when mixing new with deprecated kwargs ────────────────────── - - -def test_mix_optimizations_with_legacy_optimization_errors(): - """Cannot combine optimizations= with deprecated optimization=.""" - with pytest.raises(ParserSyntaxError, match="Cannot mix"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at( - level=pl.Level.CORE_GROUP, - optimizations=[pl.split(pl.SplitMode.UP_DOWN)], - optimization=pl.chunked_loop_optimizer, - ): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - -def test_mix_optimizations_with_legacy_split_errors(): - """Cannot combine optimizations= with deprecated split=.""" - with pytest.raises(ParserSyntaxError, match="Cannot mix"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at( - level=pl.Level.CORE_GROUP, - optimizations=[pl.auto_chunk], - split=pl.SplitMode.UP_DOWN, - ): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x + scope = _find_scope(f.body, ir.HierarchyScopeStmt) + assert scope is not None + assert scope.level == ir.Level.CORE_GROUP + assert scope.split is None # ─── Validation errors on optimizations= entries ────────────────────────────── @@ -289,23 +106,11 @@ def test_optimizations_must_be_list(): @pl.function def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimizations=pl.auto_chunk): # type: ignore[arg-type] + with pl.at(level=pl.Level.CORE_GROUP, optimizations=pl.split(pl.SplitMode.UP_DOWN)): # type: ignore[arg-type] y = pl.add(x, x) return y -def test_duplicate_auto_chunk_errors(): - """Two 
pl.auto_chunk entries in the same list is an error.""" - with pytest.raises(ParserSyntaxError, match="Duplicate.*auto_chunk"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk, pl.auto_chunk]): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - def test_duplicate_split_errors(): """Two pl.split(...) entries in the same list is an error.""" with pytest.raises(ParserSyntaxError, match="Duplicate.*split"): @@ -343,28 +148,11 @@ def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: def test_split_factory_rejects_none_at_runtime(): - """pl.split() also rejects SplitMode.NONE at construction time. - - The parser-level check above catches DSL source. This factory-level - check guards runtime construction (e.g., in scripts that build Split - instances directly), per the project rule that DSL helpers should - validate user input rather than relying on backend C++ checks. - """ + """pl.split() also rejects SplitMode.NONE at construction time.""" with pytest.raises(ValueError, match=r"SplitMode\.NONE"): pl.split(pl.SplitMode.NONE) -def test_auto_chunk_on_non_core_group_errors(): - """pl.auto_chunk is only valid at CORE_GROUP.""" - with pytest.raises(ParserSyntaxError, match="CORE_GROUP"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.HOST, optimizations=[pl.auto_chunk]): - y = pl.add(x, x) - return y - - def test_split_on_non_core_group_errors(): """pl.split(...) 
is only valid at CORE_GROUP.""" with pytest.raises(ParserSyntaxError, match="CORE_GROUP"): @@ -379,20 +167,6 @@ def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: # ─── Fully qualified pl.optimizations.* forms ──────────────────────────────── -def test_fully_qualified_auto_chunk(): - """pl.optimizations.auto_chunk also works.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.optimizations.auto_chunk]): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - scope = _find_scope(f.body, ir.AutoInCoreScopeStmt) - assert scope is not None - - def test_fully_qualified_split(): """pl.optimizations.split(...) also works.""" @@ -405,7 +179,7 @@ def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: y = pl.add(x, x) return y - scope = _find_scope(f.body, ir.InCoreScopeStmt) + scope = _find_scope(f.body, ir.HierarchyScopeStmt) assert scope is not None assert scope.split == ir.SplitMode.UP_DOWN diff --git a/tests/ut/ir/parser/test_parse_pl_at.py b/tests/ut/ir/parser/test_parse_pl_at.py index 2a0589191..72376d181 100644 --- a/tests/ut/ir/parser/test_parse_pl_at.py +++ b/tests/ut/ir/parser/test_parse_pl_at.py @@ -7,9 +7,8 @@ # See LICENSE in the root of the software repository for the full text of the License. # ----------------------------------------------------------------------------------------------------------- -"""Tests for parsing pl.at(level=..., role=...) 
(Step 04).""" +"""Tests for parsing pl.at(level=..., role=...).""" -import warnings from typing import TypeVar import pypto.language as pl @@ -150,28 +149,13 @@ def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: # ─── Backward compatibility ─────────────────────────────────────────────── -def test_backward_compat_incore(): - """Existing pl.incore() still works.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.incore(): - y = pl.add(x, x) - return y - - scope = _find_scope(f.body, ir.InCoreScopeStmt) - assert scope is not None - assert scope.scope_kind == ir.ScopeKind.InCore - assert not isinstance(scope, ir.HierarchyScopeStmt) - - def test_backward_compat_cluster(): """Existing pl.cluster() still works.""" @pl.function def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: with pl.cluster(): - with pl.incore(): + with pl.at(level=pl.Level.CORE_GROUP): y = pl.add(x, x) return y @@ -198,11 +182,11 @@ def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: assert "Role.Worker" in printed -# ─── New pl.at() InCore / AutoInCore forms ─────────────────────────────────── +# ─── pl.at() with CORE_GROUP level ─────────────────────────────────────── -def test_parse_pl_at_core_group_incore(): - """pl.at(level=CORE_GROUP) creates InCoreScopeStmt.""" +def test_parse_pl_at_core_group(): + """pl.at(level=CORE_GROUP) creates HierarchyScopeStmt at CORE_GROUP.""" @pl.function def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: @@ -210,81 +194,9 @@ def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: y = pl.add(x, x) return y - scope = _find_scope(f.body, ir.InCoreScopeStmt) - assert scope is not None - assert scope.scope_kind == ir.ScopeKind.InCore - - -def test_parse_pl_at_core_group_chunked_loop_optimizer_bare(): - """pl.at(level=CORE_GROUP, optimization=pl.chunked_loop_optimizer) → AutoInCore, split=UP_DOWN.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> 
pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - scope = _find_scope(f.body, ir.AutoInCoreScopeStmt) - assert scope is not None - assert scope.scope_kind == ir.ScopeKind.AutoInCore - assert scope.split == ir.SplitMode.UP_DOWN - - -def test_parse_pl_at_core_group_chunked_loop_optimizer_with_split(): - """pl.at(level=CORE_GROUP, optimization=chunked_loop_optimizer(split=LEFT_RIGHT)) → AutoInCore.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at( - level=pl.Level.CORE_GROUP, - optimization=pl.chunked_loop_optimizer(split=pl.SplitMode.LEFT_RIGHT), - ): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - scope = _find_scope(f.body, ir.AutoInCoreScopeStmt) + scope = _find_scope(f.body, ir.HierarchyScopeStmt) assert scope is not None - assert scope.scope_kind == ir.ScopeKind.AutoInCore - assert scope.split == ir.SplitMode.LEFT_RIGHT - - -def test_parse_pl_at_optimization_on_non_core_group_errors(): - """optimization= is not supported for non-CORE_GROUP levels.""" - with pytest.raises(ParserSyntaxError, match="CORE_GROUP"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.HOST, optimization=pl.chunked_loop_optimizer): - y = pl.add(x, x) - return y - - -def test_parse_pl_at_unknown_optimization_errors(): - """optimization= with unsupported value raises error.""" - with pytest.raises(ParserSyntaxError, match="chunked_loop_optimizer"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=42): # type: ignore[arg-type] - y = pl.add(x, x) - return y - - -def test_parse_pl_at_split_mode_none_errors(): - """chunked_loop_optimizer(split=SplitMode.NONE) raises error.""" - with 
pytest.raises(ParserSyntaxError, match=r"SplitMode\.NONE"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at( - level=pl.Level.CORE_GROUP, - optimization=pl.chunked_loop_optimizer(split=pl.SplitMode.NONE), - ): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x + assert scope.level == ir.Level.CORE_GROUP def test_parse_pl_at_role_with_core_group_errors(): @@ -298,138 +210,5 @@ def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: return y -def test_incore_deprecation_warning(): - """pl.incore() emits DeprecationWarning at parse time.""" - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.incore(): - y = pl.add(x, x) - return y - - assert any(issubclass(warning.category, DeprecationWarning) for warning in w) - assert any("pl.incore()" in str(warning.message) for warning in w) - - -def test_auto_incore_deprecation_warning(): - """pl.auto_incore() emits DeprecationWarning at parse time.""" - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.auto_incore(): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, x) - return x - - assert any(issubclass(warning.category, DeprecationWarning) for warning in w) - assert any("pl.auto_incore()" in str(warning.message) for warning in w) - - -# ─── InCore with split ────────────────────────────────────────────────────── - - -def test_parse_pl_incore_with_split(): - """pl.incore(split=UP_DOWN) creates InCoreScopeStmt with split.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.incore(split=pl.SplitMode.UP_DOWN): - y = pl.add(x, x) - return y - - scope = _find_scope(f.body, ir.InCoreScopeStmt) - assert 
scope is not None - assert scope.scope_kind == ir.ScopeKind.InCore - assert scope.split == ir.SplitMode.UP_DOWN - - -def test_parse_pl_incore_with_split_left_right(): - """pl.incore(split=LEFT_RIGHT) creates InCoreScopeStmt with LEFT_RIGHT split.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.incore(split=pl.SplitMode.LEFT_RIGHT): - y = pl.add(x, x) - return y - - scope = _find_scope(f.body, ir.InCoreScopeStmt) - assert scope is not None - assert scope.scope_kind == ir.ScopeKind.InCore - assert scope.split == ir.SplitMode.LEFT_RIGHT - - -def test_parse_pl_at_core_group_with_split(): - """pl.at(level=CORE_GROUP, split=UP_DOWN) creates InCoreScopeStmt with split.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - y = pl.add(x, x) - return y - - scope = _find_scope(f.body, ir.InCoreScopeStmt) - assert scope is not None - assert scope.scope_kind == ir.ScopeKind.InCore - assert scope.split == ir.SplitMode.UP_DOWN - - -def test_parse_pl_at_core_group_with_split_left_right(): - """pl.at(level=CORE_GROUP, split=LEFT_RIGHT) creates InCoreScopeStmt with LEFT_RIGHT.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.LEFT_RIGHT): - y = pl.add(x, x) - return y - - scope = _find_scope(f.body, ir.InCoreScopeStmt) - assert scope is not None - assert scope.scope_kind == ir.ScopeKind.InCore - assert scope.split == ir.SplitMode.LEFT_RIGHT - - -def test_parse_pl_at_optimization_and_split_conflict(): - """Cannot use both optimization= and split= in pl.at().""" - with pytest.raises(ParserSyntaxError, match="Cannot use both"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at( - level=pl.Level.CORE_GROUP, - optimization=pl.chunked_loop_optimizer, - split=pl.SplitMode.UP_DOWN, - ): - y = pl.add(x, 
x) - return y - - -def test_parse_pl_at_split_on_non_core_group_errors(): - """split= is not supported for non-CORE_GROUP levels.""" - with pytest.raises(ParserSyntaxError, match="CORE_GROUP"): - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.HOST, split=pl.SplitMode.UP_DOWN): - y = pl.add(x, x) - return y - - -def test_printer_incore_with_split_roundtrip(): - """Python printer renders InCore scope with split and it can be re-parsed.""" - - @pl.function - def f(x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - y = pl.add(x, x) - return y - - printed = str(f) - assert "pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN)" in printed - - if __name__ == "__main__": pytest.main([__file__, "-v"]) diff --git a/tests/ut/ir/statements/test_scope_stmt.py b/tests/ut/ir/statements/test_scope_stmt.py index c8fca16fa..2ec7b1b86 100644 --- a/tests/ut/ir/statements/test_scope_stmt.py +++ b/tests/ut/ir/statements/test_scope_stmt.py @@ -7,7 +7,7 @@ # See LICENSE in the root of the software repository for the full text of the License. 
# ----------------------------------------------------------------------------------------------------------- -"""Unit tests for ScopeStmt class.""" +"""Unit tests for ScopeStmt class hierarchy.""" import pypto.language as pl import pytest @@ -17,35 +17,36 @@ class TestScopeStmt: """Test ScopeStmt construction, fields, and operations.""" - def test_scope_stmt_construction(self): - """Test basic InCoreScopeStmt construction.""" + def test_hierarchy_scope_construction(self): + """Test basic HierarchyScopeStmt construction at CORE_GROUP (replaces InCore scope).""" span = ir.Span("test.py", 1, 1, 1, 10) var_x = ir.Var("x", ir.TensorType([64], DataType.FP32), span) var_y = ir.Var("y", ir.TensorType([64], DataType.FP32), span) body = ir.AssignStmt(var_y, var_x, span) - scope = ir.InCoreScopeStmt(body=body, span=span) + scope = ir.HierarchyScopeStmt(level=ir.Level.CORE_GROUP, body=body, span=span) - assert scope.scope_kind == ir.ScopeKind.InCore + assert scope.scope_kind == ir.ScopeKind.Hierarchy + assert scope.level == ir.Level.CORE_GROUP assert isinstance(scope, ir.ScopeStmt) assert isinstance(scope.body, ir.AssignStmt) - def test_scope_stmt_structural_equality(self): - """Test structural equality for InCoreScopeStmt.""" + def test_hierarchy_scope_structural_equality(self): + """Test structural equality for HierarchyScopeStmt.""" span = ir.Span("test.py", 1, 1, 1, 10) var_x = ir.Var("x", ir.TensorType([64], DataType.FP32), span) var_y = ir.Var("y", ir.TensorType([64], DataType.FP32), span) body1 = ir.AssignStmt(var_y, var_x, span) - scope1 = ir.InCoreScopeStmt(body=body1, span=span) + scope1 = ir.HierarchyScopeStmt(level=ir.Level.CORE_GROUP, body=body1, span=span) body2 = ir.AssignStmt(var_y, var_x, span) - scope2 = ir.InCoreScopeStmt(body=body2, span=span) + scope2 = ir.HierarchyScopeStmt(level=ir.Level.CORE_GROUP, body=body2, span=span) assert ir.structural_equal(scope1, scope2) def test_scope_stmt_printing(self): - """Test Python printer output for ScopeStmt.""" + 
"""Test Python printer output for HierarchyScopeStmt at CORE_GROUP.""" @pl.program class TestProgram: @@ -58,25 +59,25 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: printed = TestProgram.as_python() assert "with pl.at(level=pl.Level.CORE_GROUP):" in printed - def test_scope_stmt_with_name(self): - """Test InCoreScopeStmt construction with a user-provided name.""" + def test_hierarchy_scope_with_name(self): + """Test HierarchyScopeStmt construction with a user-provided name.""" span = ir.Span("test.py", 1, 1, 1, 10) var_x = ir.Var("x", ir.TensorType([64], DataType.FP32), span) var_y = ir.Var("y", ir.TensorType([64], DataType.FP32), span) body = ir.AssignStmt(var_y, var_x, span) - scope = ir.InCoreScopeStmt(name_hint="my_kernel", body=body, span=span) + scope = ir.HierarchyScopeStmt(level=ir.Level.CORE_GROUP, name_hint="my_kernel", body=body, span=span) assert scope.name_hint == "my_kernel" - assert scope.scope_kind == ir.ScopeKind.InCore + assert scope.scope_kind == ir.ScopeKind.Hierarchy - def test_scope_stmt_default_name_is_empty(self): + def test_hierarchy_scope_default_name_is_empty(self): """Test that default name is empty string.""" span = ir.Span("test.py", 1, 1, 1, 10) var_x = ir.Var("x", ir.TensorType([64], DataType.FP32), span) var_y = ir.Var("y", ir.TensorType([64], DataType.FP32), span) body = ir.AssignStmt(var_y, var_x, span) - scope = ir.InCoreScopeStmt(body=body, span=span) + scope = ir.HierarchyScopeStmt(level=ir.Level.CORE_GROUP, body=body, span=span) assert scope.name_hint == "" def test_spmd_scope_requires_positive_core_num(self): @@ -101,6 +102,28 @@ def test_hierarchy_scope_typed_fields(self): assert scope.role == ir.Role.Worker assert scope.scope_kind == ir.ScopeKind.Hierarchy + def test_hierarchy_scope_split_at_core_group(self): + """HierarchyScopeStmt accepts split at CORE_GROUP.""" + span = ir.Span("test.py", 1, 1, 1, 10) + var_x = ir.Var("x", ir.TensorType([64], DataType.FP32), span) + var_y = ir.Var("y", 
ir.TensorType([64], DataType.FP32), span) + body = ir.AssignStmt(var_y, var_x, span) + + scope = ir.HierarchyScopeStmt( + level=ir.Level.CORE_GROUP, split=ir.SplitMode.UP_DOWN, body=body, span=span + ) + assert scope.split == ir.SplitMode.UP_DOWN + + def test_hierarchy_scope_split_rejected_at_non_core_group(self): + """HierarchyScopeStmt rejects split at levels other than CORE_GROUP.""" + span = ir.Span("test.py", 1, 1, 1, 10) + var_x = ir.Var("x", ir.TensorType([64], DataType.FP32), span) + var_y = ir.Var("y", ir.TensorType([64], DataType.FP32), span) + body = ir.AssignStmt(var_y, var_x, span) + + with pytest.raises(ValueError, match="split is only valid at Level::CORE_GROUP"): + ir.HierarchyScopeStmt(level=ir.Level.HOST, split=ir.SplitMode.UP_DOWN, body=body, span=span) + if __name__ == "__main__": pytest.main([__file__, "-v"]) diff --git a/tests/ut/ir/statements/test_scope_stmt_hierarchy.py b/tests/ut/ir/statements/test_scope_stmt_hierarchy.py index eb0540e15..7f641e300 100644 --- a/tests/ut/ir/statements/test_scope_stmt_hierarchy.py +++ b/tests/ut/ir/statements/test_scope_stmt_hierarchy.py @@ -7,7 +7,7 @@ # See LICENSE in the root of the software repository for the full text of the License. 
# ----------------------------------------------------------------------------------------------------------- -"""Tests for ScopeStmt Hierarchy kind (Step 03).""" +"""Tests for the typed ScopeStmt class hierarchy.""" import pypto.language as pl import pytest @@ -22,7 +22,7 @@ def _span(): return ir.Span("test", 1, 0) -# ─── ScopeKind.Hierarchy value ──────────────────────────────────────────────── +# ─── ScopeKind values ──────────────────────────────────────────────────────── def test_hierarchy_scope_kind_exists(): @@ -30,23 +30,16 @@ def test_hierarchy_scope_kind_exists(): assert hasattr(ir.ScopeKind, "Hierarchy") -def test_hierarchy_scope_kind_distinct(): - """Hierarchy is distinct from existing ScopeKind values.""" - assert ir.ScopeKind.Hierarchy != ir.ScopeKind.InCore - assert ir.ScopeKind.Hierarchy != ir.ScopeKind.AutoInCore +def test_scope_kinds_are_distinct(): + """Each surviving ScopeKind is distinct.""" assert ir.ScopeKind.Hierarchy != ir.ScopeKind.Cluster + assert ir.ScopeKind.Hierarchy != ir.ScopeKind.Spmd + assert ir.ScopeKind.Cluster != ir.ScopeKind.Spmd # ─── Construction with derived classes (issue #1047) ──────────────────────── -def test_in_core_scope_construction(): - """InCoreScopeStmt construction works.""" - s = ir.InCoreScopeStmt(body=_empty_body(), span=_span()) - assert s.scope_kind == ir.ScopeKind.InCore - assert isinstance(s, ir.ScopeStmt) - - def test_cluster_scope_construction(): """ClusterScopeStmt construction works.""" s = ir.ClusterScopeStmt(body=_empty_body(), span=_span()) @@ -114,11 +107,11 @@ def test_structural_equal_different_role(): def test_structural_equal_different_kinds(): - """Different scope kinds (InCore vs Hierarchy) compare as unequal.""" - s_in = ir.InCoreScopeStmt(body=_empty_body(), span=_span()) + """Different scope kinds compare as unequal.""" + s_cluster = ir.ClusterScopeStmt(body=_empty_body(), span=_span()) s_hier = ir.HierarchyScopeStmt(level=ir.Level.HOST, body=_empty_body(), span=_span()) with 
pytest.raises(ValueError): - ir.assert_structural_equal(s_in, s_hier) + ir.assert_structural_equal(s_cluster, s_hier) # ─── Python printer ────────────────────────────────────────────────────────── @@ -134,46 +127,59 @@ def test_printer_hierarchy_scope(): assert "Role.Worker" in printed -def test_printer_incore_scope_unchanged(): +def test_printer_core_group_scope(): body = _empty_body() - scope = ir.InCoreScopeStmt(body=body, span=_span()) + scope = ir.HierarchyScopeStmt(level=ir.Level.CORE_GROUP, body=body, span=_span()) func = ir.Function("test_fn", [], [], scope, _span()) printed = str(func) assert "pl.at(level=pl.Level.CORE_GROUP)" in printed -def test_printer_incore_scope_with_split(): +def test_printer_core_group_scope_with_split(): body = _empty_body() - scope = ir.InCoreScopeStmt(split=ir.SplitMode.UP_DOWN, body=body, span=_span()) + scope = ir.HierarchyScopeStmt( + level=ir.Level.CORE_GROUP, split=ir.SplitMode.UP_DOWN, body=body, span=_span() + ) func = ir.Function("test_fn", [], [], scope, _span()) printed = str(func) - assert "pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN)" in printed + assert "pl.at(level=pl.Level.CORE_GROUP" in printed + assert "pl.split(pl.SplitMode.UP_DOWN)" in printed -def test_scope_stmt_incore_with_split(): - s = ir.InCoreScopeStmt(split=ir.SplitMode.UP_DOWN, body=_empty_body(), span=_span()) - assert s.scope_kind == ir.ScopeKind.InCore +def test_scope_stmt_core_group_with_split(): + s = ir.HierarchyScopeStmt( + level=ir.Level.CORE_GROUP, split=ir.SplitMode.UP_DOWN, body=_empty_body(), span=_span() + ) + assert s.scope_kind == ir.ScopeKind.Hierarchy assert s.split == ir.SplitMode.UP_DOWN -def test_structural_equal_incore_with_split(): - s1 = ir.InCoreScopeStmt(split=ir.SplitMode.UP_DOWN, body=_empty_body(), span=_span()) - s2 = ir.InCoreScopeStmt(split=ir.SplitMode.UP_DOWN, body=_empty_body(), span=_span()) +def test_structural_equal_core_group_with_split(): + s1 = ir.HierarchyScopeStmt( + level=ir.Level.CORE_GROUP, 
split=ir.SplitMode.UP_DOWN, body=_empty_body(), span=_span() + ) + s2 = ir.HierarchyScopeStmt( + level=ir.Level.CORE_GROUP, split=ir.SplitMode.UP_DOWN, body=_empty_body(), span=_span() + ) ir.assert_structural_equal(s1, s2) -def test_structural_equal_incore_different_split(): - s1 = ir.InCoreScopeStmt(split=ir.SplitMode.UP_DOWN, body=_empty_body(), span=_span()) - s2 = ir.InCoreScopeStmt(split=ir.SplitMode.LEFT_RIGHT, body=_empty_body(), span=_span()) +def test_structural_equal_core_group_different_split(): + s1 = ir.HierarchyScopeStmt( + level=ir.Level.CORE_GROUP, split=ir.SplitMode.UP_DOWN, body=_empty_body(), span=_span() + ) + s2 = ir.HierarchyScopeStmt( + level=ir.Level.CORE_GROUP, split=ir.SplitMode.LEFT_RIGHT, body=_empty_body(), span=_span() + ) with pytest.raises(ValueError): ir.assert_structural_equal(s1, s2) -# ─── Outline pass safety ───────────────────────────────────────────────────── +# ─── Outline pass ──────────────────────────────────────────────────────────── -def test_outline_incore_works_with_normal_program(): - """OutlineIncoreScopes works normally on programs without Hierarchy scopes.""" +def test_outline_hierarchy_works_with_core_group_program(): + """OutlineHierarchyScopes outlines CORE_GROUP scopes into Function(InCore).""" @pl.program class P: @@ -183,22 +189,9 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: y = pl.add(x, x) return y - After = passes.outline_incore_scopes()(P) + After = passes.outline_hierarchy_scopes()(P) assert After is not None -def test_scope_outliner_ignores_hierarchy_kind(): - """ScopeOutliner (used by OutlineIncoreScopes) only targets its configured - ScopeKind and naturally ignores Hierarchy scopes via the ScopeKind check.""" - # The ScopeOutliner matches on target_scope_kind_ (InCore or Cluster). - # ScopeKind::Hierarchy (value 3) != InCore (0) != Cluster (2), so - # the outliner's VisitStmt_ will skip it via: if (scope_kind_ != target_) return. 
- # We verify this property at the enum level since we can't inject a Hierarchy - # scope via the DSL parser yet (pl.at() parsing is Step 04). - assert ir.ScopeKind.Hierarchy != ir.ScopeKind.InCore - assert ir.ScopeKind.Hierarchy != ir.ScopeKind.Cluster - assert ir.ScopeKind.Hierarchy != ir.ScopeKind.AutoInCore - - if __name__ == "__main__": pytest.main([__file__, "-v"]) diff --git a/tests/ut/ir/transforms/test_ctrl_flow_transform.py b/tests/ut/ir/transforms/test_ctrl_flow_transform.py index 292b56e8e..8d8bf0e60 100644 --- a/tests/ut/ir/transforms/test_ctrl_flow_transform.py +++ b/tests/ut/ir/transforms/test_ctrl_flow_transform.py @@ -2023,13 +2023,14 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: return x after_ssa = passes.convert_to_ssa()(Input) - after_outline = passes.outline_incore_scopes()(after_ssa) + after_outline_h = passes.outline_hierarchy_scopes()(after_ssa) + after_outline = passes.outline_incore_scopes()(after_outline_h) After = passes.ctrl_flow_transform()(after_outline) @pl.program class Expected: - @pl.function(type=pl.FunctionType.InCore, strict_ssa=True) - def main_incore_0(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: # noqa: F841 + @pl.function(type=pl.FunctionType.InCore, level=pl.Level.CORE_GROUP, strict_ssa=True) + def main_core_group_0(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: # noqa: F841 for i, (x_iter,) in pl.range(10, init_values=(x_0,)): if i < 5: phi: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter) @@ -2041,7 +2042,7 @@ def main_incore_0(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP3 @pl.function(type=pl.FunctionType.Orchestration) def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - x_rv: pl.Tensor[[64], pl.FP32] = self.main_incore_0(x_0) + x_rv: pl.Tensor[[64], pl.FP32] = self.main_core_group_0(x_0) return x_rv ir.assert_structural_equal(After, Expected) diff --git 
a/tests/ut/ir/transforms/test_fuse_create_assemble_to_slice.py b/tests/ut/ir/transforms/test_fuse_create_assemble_to_slice.py index 540ce0c7a..c6f6e79a9 100644 --- a/tests/ut/ir/transforms/test_fuse_create_assemble_to_slice.py +++ b/tests/ut/ir/transforms/test_fuse_create_assemble_to_slice.py @@ -21,7 +21,6 @@ def _run_prereqs_only(program): pipeline.add_pass(passes.normalize_stmt_structure()) pipeline.add_pass(passes.flatten_call_expr()) pipeline.add_pass(passes.outline_hierarchy_scopes()) - pipeline.add_pass(passes.outline_incore_scopes()) pipeline.add_pass(passes.outline_cluster_scopes()) ctx = passes.PassContext([], passes.VerificationLevel.NONE) with ctx: @@ -35,7 +34,6 @@ def _run_prereqs_and_fuse(program): pipeline.add_pass(passes.normalize_stmt_structure()) pipeline.add_pass(passes.flatten_call_expr()) pipeline.add_pass(passes.outline_hierarchy_scopes()) - pipeline.add_pass(passes.outline_incore_scopes()) pipeline.add_pass(passes.outline_cluster_scopes()) pipeline.add_pass(passes.fuse_create_assemble_to_slice()) ctx = passes.PassContext([], passes.VerificationLevel.NONE) diff --git a/tests/ut/ir/transforms/test_interchange_chunk_loops.py b/tests/ut/ir/transforms/test_interchange_chunk_loops.py deleted file mode 100644 index 4e80963cf..000000000 --- a/tests/ut/ir/transforms/test_interchange_chunk_loops.py +++ /dev/null @@ -1,1144 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
-# ----------------------------------------------------------------------------------------------------------- - -"""Unit tests for InterchangeChunkLoops pass. - -Test strategy: - Build a `Before` program, run prerequisite passes + InterchangeChunkLoops, - and compare to an explicitly-constructed `Expected` program using - `ir.assert_structural_equal(..., enable_auto_mapping=True)`. -""" - -import re - -import pypto.language as pl -import pytest -from pypto import ir, passes -from pypto.ir.printer import python_print - - -def _prepare_for_interchange(program): - """Run prerequisite passes to produce input for InterchangeChunkLoops.""" - program = passes.unroll_loops()(program) - program = passes.convert_to_ssa()(program) - program = passes.flatten_call_expr()(program) - program = passes.split_chunked_loops()(program) - return program - - -class TestSingleParallelChunk: - """Tests for single parallel chunked loop (1 outer + 1 inner, InCore wrapping only).""" - - def test_single_parallel_chunk_gets_incore(self): - """Single parallel chunked loop: outer wraps InCore around inner.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x2,) in pl.parallel( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x3: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x2, 1.0) - x4: pl.Tensor[[64], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[64], pl.FP32] = pl.yield_(x4) - return x5 - - After = 
passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestNestedParallelChunks: - """Tests for nested parallel chunked loops (full interchange + InCore).""" - - def test_two_nested_parallel_divisible(self): - """Two nested parallel chunked loops, divisible: full interchange + InCore.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - for j in pl.parallel(0, 12, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for j0, (x2,) in pl.parallel( - 3, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x3,) in pl.parallel( - 4, init_values=(x2,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - for j1, (x4,) in pl.parallel( - 4, init_values=(x3,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x5: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x4, 1.0) - x6: pl.Tensor[[64], pl.FP32] = pl.yield_(x5) - x7: pl.Tensor[[64], pl.FP32] = pl.yield_(x6) - x8: pl.Tensor[[64], pl.FP32] = pl.yield_(x7) - x9: pl.Tensor[[64], pl.FP32] = pl.yield_(x8) - return x9 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_two_nested_parallel_with_iter_args(self): - """Two nested parallel chunked loops with iter_args: verify SSA threading. 
- - Same Before as ``test_two_nested_parallel_divisible`` — this test also - structurally confirms that iter_args thread correctly through every - level of the interchanged nest. - """ - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - for j in pl.parallel(0, 12, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for j0, (x2,) in pl.parallel( - 3, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x3,) in pl.parallel( - 4, init_values=(x2,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - for j1, (x4,) in pl.parallel( - 4, init_values=(x3,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x5: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x4, 1.0) - x6: pl.Tensor[[64], pl.FP32] = pl.yield_(x5) - x7: pl.Tensor[[64], pl.FP32] = pl.yield_(x6) - x8: pl.Tensor[[64], pl.FP32] = pl.yield_(x7) - x9: pl.Tensor[[64], pl.FP32] = pl.yield_(x8) - return x9 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestNestedChunkChainsInitSubstitution: - """Tests that nested chunk chains correctly substitute init_values from parent chain.""" - - def test_nested_chains_init_values_substituted(self): - """Nested parallel chunk chains: inner chain init_values reference parent's - rewritten iter_args, not the original pre-interchange names.""" - - @pl.program - class Before: - @pl.function - def 
main( - self, - x: pl.Tensor[[64], pl.FP32], - y: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - for h in pl.parallel(0, 12, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, y) - return x - - @pl.program - class Expected: - @pl.function - def main( - self, - x0: pl.Tensor[[64], pl.FP32], - y0: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - for b0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for h0, (x2,) in pl.parallel( - 3, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for b1, (x3,) in pl.parallel( - 4, init_values=(x2,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - for h1, (x4,) in pl.parallel( - 4, init_values=(x3,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x5: pl.Tensor[[64], pl.FP32] = pl.tensor.add(x4, y0) - x6: pl.Tensor[[64], pl.FP32] = pl.yield_(x5) - x7: pl.Tensor[[64], pl.FP32] = pl.yield_(x6) - x8: pl.Tensor[[64], pl.FP32] = pl.yield_(x7) - x9: pl.Tensor[[64], pl.FP32] = pl.yield_(x8) - return x9 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_nested_chains_outline_no_crash(self): - """Nested parallel chunk chains followed by OutlineIncoreScopes must not crash. - - This is the end-to-end scenario from DeepSeekV3 decode that triggered the - 'Variable ... not found in symbol table' crash. 
- """ - - @pl.program - class Input: - @pl.function - def main( - self, - x: pl.Tensor[[64], pl.FP32], - y: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - for h in pl.parallel(0, 12, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, y) - return x - - program = _prepare_for_interchange(Input) - program = passes.interchange_chunk_loops()(program) - # This should not raise "Variable ... not found in symbol table" - program = passes.outline_incore_scopes()(program) - - incore_funcs = [f for f in program.functions.values() if f.func_type == ir.FunctionType.InCore] - assert len(incore_funcs) >= 1 - - def test_nested_chains_with_remainder_outline_no_crash(self): - """Nested chains with remainder: outline must not crash on substituted init_values.""" - - @pl.program - class Input: - @pl.function - def main( - self, - x: pl.Tensor[[64], pl.FP32], - y: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.parallel(0, 6, 1, chunk=4, chunk_policy="leading_full"): - for h in pl.parallel(0, 14, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, y) - return x - - program = _prepare_for_interchange(Input) - program = passes.interchange_chunk_loops()(program) - program = passes.outline_incore_scopes()(program) - - incore_funcs = [f for f in program.functions.values() if f.func_type == ir.FunctionType.InCore] - assert len(incore_funcs) >= 1 - - -class TestNestedChunksWithInterveningStatements: - """Tests for nested chunked parallel loops with intervening statements (issue #911).""" - - @staticmethod - def _make_input(): - @pl.program - class Input: - @pl.function - def main( - self, - x: pl.Tensor[[64], pl.FP32], - y: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - with 
pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.parallel(0, 16, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, y) - for h in pl.parallel(0, 8, 1, chunk=2, chunk_policy="leading_full"): - x = pl.add(x, y) - return x - - return Input - - def test_no_nested_incore_with_intervening_stmt(self): - """Nested chunks with intervening add: single InCore, no nesting.""" - - @pl.program - class Before: - @pl.function - def main( - self, - x: pl.Tensor[[64], pl.FP32], - y: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.parallel(0, 16, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, y) - for h in pl.parallel(0, 8, 1, chunk=2, chunk_policy="leading_full"): - x = pl.add(x, y) - return x - - @pl.program - class Expected: - @pl.function - def main( - self, - x0: pl.Tensor[[64], pl.FP32], - y0: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - for b0, (x1,) in pl.parallel( - 4, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for b1, (x2,) in pl.parallel( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x3: pl.Tensor[[64], pl.FP32] = pl.tensor.add(x2, y0) - for h0, (x4,) in pl.parallel( - 4, init_values=(x3,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for h1, (x5,) in pl.parallel( - 2, init_values=(x4,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x6: pl.Tensor[[64], pl.FP32] = pl.tensor.add(x5, y0) - x7: pl.Tensor[[64], pl.FP32] = pl.yield_(x6) - x8: pl.Tensor[[64], pl.FP32] = pl.yield_(x7) - x9: pl.Tensor[[64], pl.FP32] = pl.yield_(x8) - x10: pl.Tensor[[64], pl.FP32] = pl.yield_(x9) - return x10 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def 
test_outline_no_crash_with_intervening_stmt(self): - """Nested chunks with intervening stmt: outline must not crash.""" - program = _prepare_for_interchange(self._make_input()) - program = passes.interchange_chunk_loops()(program) - # This must not crash with nested InCore or missing operator - program = passes.outline_incore_scopes()(program) - - incore_funcs = [f for f in program.functions.values() if f.func_type == ir.FunctionType.InCore] - assert len(incore_funcs) >= 1 - - -class TestChunkWithRemainderInChain: - """Tests for chunk chains that include remainder loops (non-divisible inner).""" - - def test_chunk_outer_inner_with_remainder_preserves_iter_args(self): - """Chunk chain with trailing remainder: iter_args thread through inner, remainder preserved.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - for j in pl.parallel(0, 1, 1, chunk=2, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x2,) in pl.parallel( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - for j0, (x3,) in pl.parallel( - 1, init_values=(x2,), attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder} - ): - x4: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x3, 1.0) - x5: pl.Tensor[[64], pl.FP32] = pl.yield_(x4) - x6: pl.Tensor[[64], pl.FP32] = pl.yield_(x5) - x7: pl.Tensor[[64], pl.FP32] = pl.yield_(x6) - return x7 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - 
ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_chunk_with_remainder_body_contains_remainder_loop(self): - """Remainder loop inside chain body is preserved after interchange. - - Same Before as ``test_chunk_outer_inner_with_remainder_preserves_iter_args`` - — the matching Expected confirms the remainder loop structurally survives. - """ - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - for j in pl.parallel(0, 1, 1, chunk=2, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x2,) in pl.parallel( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - for j0, (x3,) in pl.parallel( - 1, init_values=(x2,), attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder} - ): - x4: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x3, 1.0) - x5: pl.Tensor[[64], pl.FP32] = pl.yield_(x4) - x6: pl.Tensor[[64], pl.FP32] = pl.yield_(x5) - x7: pl.Tensor[[64], pl.FP32] = pl.yield_(x6) - return x7 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestRemainderLoops: - """Tests for non-divisible cases with remainder loops.""" - - def test_non_divisible_with_remainder(self): - """Non-divisible with remainder: main chunk gets interchange, remainder gets InCore.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with 
pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 6, 1, chunk=4, chunk_policy="leading_full"): - for j in pl.parallel(0, 14, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 1, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x2,) in pl.parallel( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - for j0, (x3,) in pl.parallel( - 3, init_values=(x2,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for j1, (x4,) in pl.parallel( - 4, init_values=(x3,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x5: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x4, 1.0) - x6: pl.Tensor[[64], pl.FP32] = pl.yield_(x5) - x7: pl.Tensor[[64], pl.FP32] = pl.yield_(x6) - for j2, (x8,) in pl.parallel( - 2, init_values=(x7,), attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder} - ): - x9: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x8, 1.0) - x10: pl.Tensor[[64], pl.FP32] = pl.yield_(x9) - x11: pl.Tensor[[64], pl.FP32] = pl.yield_(x10) - x12: pl.Tensor[[64], pl.FP32] = pl.yield_(x11) - for i2, (x13,) in pl.parallel( - 2, init_values=(x12,), attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder} - ): - for j3, (x14,) in pl.parallel( - 3, init_values=(x13,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for j4, (x15,) in pl.parallel( - 4, init_values=(x14,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x16: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x15, 1.0) - x17: pl.Tensor[[64], pl.FP32] = pl.yield_(x16) - x18: pl.Tensor[[64], pl.FP32] = pl.yield_(x17) - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for j5, (x19,) in 
pl.parallel( - 2, init_values=(x18,), attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder} - ): - x20: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x19, 1.0) - x21: pl.Tensor[[64], pl.FP32] = pl.yield_(x20) - x22: pl.Tensor[[64], pl.FP32] = pl.yield_(x21) - return x22 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestNonChunkedLoops: - """Tests for loops that should pass through unchanged.""" - - def test_non_chunked_loop_unchanged(self): - """Regular (non-chunked) loops pass through untouched.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i in pl.range(0, 10, 1): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.range(10, init_values=(x0,)): - x2: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[64], pl.FP32] = pl.yield_(x2) - return x3 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestSequentialChunks: - """Tests for sequential chunked loops (should NOT interchange but get InCore wrapping).""" - - def test_sequential_chunk_gets_incore(self): - """Sequential chunked loop inside auto_incore: gets InCore wrapping.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i0, 
(x1,) in pl.range( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i1, (x2,) in pl.range( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x3: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x2, 1.0) - x4: pl.Tensor[[64], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[64], pl.FP32] = pl.yield_(x4) - return x5 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_nested_sequential_chunks_get_incore(self): - """Nested sequential chunked loops: no interchange, but get InCore wrapping.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 8, 1, chunk=4, chunk_policy="leading_full"): - for j in pl.range(0, 12, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i0, (x1,) in pl.range( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i1, (x2,) in pl.range( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - for j0, (x3,) in pl.range( - 3, init_values=(x2,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for j1, (x4,) in pl.range( - 4, init_values=(x3,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x5: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x4, 1.0) - x6: pl.Tensor[[64], pl.FP32] = pl.yield_(x5) - x7: pl.Tensor[[64], pl.FP32] = pl.yield_(x6) - x8: pl.Tensor[[64], pl.FP32] = pl.yield_(x7) - x9: pl.Tensor[[64], pl.FP32] = pl.yield_(x8) - return x9 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - 
ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestExistingInCore: - """Tests for loops with existing InCore scope (should skip interchange).""" - - def test_existing_incore_skip(self): - """Body already has ScopeStmt(InCore): pass through unchanged by interchange.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - with pl.at(level=pl.Level.CORE_GROUP): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i1, (x2,) in pl.parallel( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - with pl.at(level=pl.Level.CORE_GROUP): - x3: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x2, 1.0) - x4: pl.Tensor[[64], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[64], pl.FP32] = pl.yield_(x4) - return x5 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestAutoIncoreConsumed: - """Tests that auto_incore scope is consumed by InterchangeChunkLoops.""" - - def test_auto_incore_consumed(self): - """AutoInCore scope should be removed after InterchangeChunkLoops. - - Same Before as ``TestSingleParallelChunk::test_single_parallel_chunk_gets_incore`` - — the Expected has no ``chunked_loop_optimizer`` marker, structurally - asserting the AutoInCore scope was consumed. 
- """ - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x2,) in pl.parallel( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x3: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x2, 1.0) - x4: pl.Tensor[[64], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[64], pl.FP32] = pl.yield_(x4) - return x5 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestPassProperties: - """Tests for pass properties and factory.""" - - def test_pass_name(self): - """Pass has correct name.""" - p = passes.interchange_chunk_loops() - assert p.get_name() == "InterchangeChunkLoops" - - def test_pass_required_properties(self): - """Pass requires SSAForm (TypeChecked is a structural property).""" - p = passes.interchange_chunk_loops() - req = p.get_required_properties() - assert req.contains(passes.IRProperty.SSAForm) - - def test_pass_produced_properties(self): - """Pass produces SSAForm (TypeChecked is a structural property).""" - p = passes.interchange_chunk_loops() - prod = p.get_produced_properties() - assert prod.contains(passes.IRProperty.SSAForm) - - -class TestNoNestedIncoreVerifier: - """Tests for the NoNestedInCore structural property verifier (issue #912).""" - - def test_no_nested_incore_is_structural_property(self): - """NoNestedInCore is in the structural property set.""" - structural = 
passes.get_structural_properties() - assert structural.contains(passes.IRProperty.NoNestedInCore) - - def test_verifier_passes_on_valid_ir(self): - """Verifier passes when InterchangeChunkLoops produces valid (non-nested) InCore.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - program = _prepare_for_interchange(Input) - program = passes.interchange_chunk_loops()(program) - - props = passes.IRPropertySet() - props.insert(passes.IRProperty.NoNestedInCore) - diagnostics = passes.PropertyVerifierRegistry.verify(props, program) - errors = [d for d in diagnostics if d.severity == passes.DiagnosticSeverity.Error] - assert len(errors) == 0 - - def test_verifier_passes_with_intervening_stmts(self): - """Verifier passes on fixed nested chunks with intervening statements.""" - - @pl.program - class Input: - @pl.function - def main( - self, - x: pl.Tensor[[64], pl.FP32], - y: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.parallel(0, 16, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, y) - for h in pl.parallel(0, 8, 1, chunk=2, chunk_policy="leading_full"): - x = pl.add(x, y) - return x - - program = _prepare_for_interchange(Input) - program = passes.interchange_chunk_loops()(program) - - props = passes.IRPropertySet() - props.insert(passes.IRProperty.NoNestedInCore) - diagnostics = passes.PropertyVerifierRegistry.verify(props, program) - errors = [d for d in diagnostics if d.severity == passes.DiagnosticSeverity.Error] - assert len(errors) == 0 - - -class TestNonChunkStatementsWrapping: - """Tests that non-chunk statements inside auto_incore get InCore wrapping.""" - - def 
test_standalone_tensor_op_wrapped(self): - """Standalone tensor op inside auto_incore gets wrapped in InCore.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - x1: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x0, 1.0) - return x1 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_standalone_op_before_parallel_chunk(self): - """Standalone op before parallel chunk: op wrapped separately, chunk interchanged.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - x = pl.add(x, 1.0) - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 2.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - x1: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x0, 1.0) - for i0, (x2,) in pl.parallel( - 2, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x3,) in pl.parallel( - 4, init_values=(x2,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x4: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x3, 2.0) - x5: pl.Tensor[[64], pl.FP32] = pl.yield_(x4) - x6: pl.Tensor[[64], pl.FP32] = pl.yield_(x5) - return x6 - - After = 
passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_standalone_op_after_parallel_chunk(self): - """Standalone op after parallel chunk: chunk interchanged, op wrapped separately.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 2.0) - x = pl.mul(x, 3.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x2,) in pl.parallel( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x3: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x2, 2.0) - x4: pl.Tensor[[64], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[64], pl.FP32] = pl.yield_(x4) - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - x6: pl.Tensor[[64], pl.FP32] = pl.tensor.muls(x5, 3.0) - return x6 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_host_side_assemble_after_parallel_chunk_not_wrapped(self): - """Host-side tail assemble after a chunk stays outside InCore.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[4], pl.FP32]) -> pl.Tensor[[8], pl.FP32]: - out_0: pl.Tensor[[8], pl.FP32] = pl.tensor.create( - [8], dtype=pl.FP32, layout=pl.TensorLayout.ND - ) - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 4, 1, chunk=2, chunk_policy="leading_full"): - x = 
pl.tensor.adds(x, 1.0) - out_1: pl.Tensor[[8], pl.FP32] = pl.tensor.assemble(out_0, x, [0]) - return out_1 - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[4], pl.FP32]) -> pl.Tensor[[8], pl.FP32]: - out_0_0: pl.Tensor[[8], pl.FP32] = pl.tensor.create( - [8], dtype=pl.FP32, layout=pl.TensorLayout.ND - ) - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x2,) in pl.parallel( - 2, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x3: pl.Tensor[[4], pl.FP32] = pl.tensor.adds(x2, 1.0) - x4: pl.Tensor[[4], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[4], pl.FP32] = pl.yield_(x4) - out_1_0: pl.Tensor[[8], pl.FP32] = pl.tensor.assemble(out_0_0, x5, [0]) - return out_1_0 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_multiple_parallel_chunks_no_regression(self): - """Multiple parallel chunks with no standalone ops: all interchanged, no extra wrapping.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - for j in pl.parallel(0, 12, 1, chunk=4, chunk_policy="leading_full"): - x = pl.mul(x, 2.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x2,) in pl.parallel( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x3: pl.Tensor[[64], 
pl.FP32] = pl.tensor.adds(x2, 1.0) - x4: pl.Tensor[[64], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[64], pl.FP32] = pl.yield_(x4) - for j0, (x6,) in pl.parallel( - 3, init_values=(x5,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for j1, (x7,) in pl.parallel( - 4, init_values=(x6,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x8: pl.Tensor[[64], pl.FP32] = pl.tensor.muls(x7, 2.0) - x9: pl.Tensor[[64], pl.FP32] = pl.yield_(x8) - x10: pl.Tensor[[64], pl.FP32] = pl.yield_(x9) - return x10 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_non_chunked_loop_inside_auto_incore_wrapped(self): - """Non-chunked loop with tensor ops inside auto_incore gets wrapped in InCore.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(10): - x = pl.add(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i0, (x1,) in pl.range(10, init_values=(x0,)): - x2: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[64], pl.FP32] = pl.yield_(x2) - return x3 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_mixed_parallel_and_sequential_chunks(self): - """Mixed parallel chunk + sequential chunk: parallel interchanged, sequential wrapped.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, 
optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - for j in pl.range(0, 12, 1, chunk=4, chunk_policy="leading_full"): - x = pl.mul(x, 2.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x2,) in pl.parallel( - 4, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x3: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x2, 1.0) - x4: pl.Tensor[[64], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[64], pl.FP32] = pl.yield_(x4) - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for j0, (x6,) in pl.range( - 3, init_values=(x5,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for j1, (x7,) in pl.range( - 4, init_values=(x6,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x8: pl.Tensor[[64], pl.FP32] = pl.tensor.muls(x7, 2.0) - x9: pl.Tensor[[64], pl.FP32] = pl.yield_(x8) - x10: pl.Tensor[[64], pl.FP32] = pl.yield_(x9) - return x10 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestScalarAssignmentNotWrapped: - """Tests that pure scalar assignments stay outside InCore scopes.""" - - def test_scalar_assign_adjacent_to_compute_not_wrapped(self): - """Scalar assignment adjacent to tensor compute ops stays in orchestration.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for ob in pl.range(0, 8): - offset: pl.Scalar[pl.INDEX] = ob * 4 # noqa: F841 - x = pl.add(x, 1.0) - for i in pl.parallel(0, 8, 1, chunk=4, 
chunk_policy="leading_full"): - x = pl.add(x, 2.0) - return x - - @pl.program - class Expected: - @pl.function - def main(self, x0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for ob0, (x1,) in pl.range(8, init_values=(x0,)): - offset0: pl.Scalar[pl.INDEX] = ob0 * 4 - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - x2: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x1, 1.0) - for i0, (x3,) in pl.parallel( - 2, init_values=(x2,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - with pl.at(level=pl.Level.CORE_GROUP, split=pl.SplitMode.UP_DOWN): - for i1, (x4,) in pl.parallel( - 4, init_values=(x3,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x5: pl.Tensor[[64], pl.FP32] = pl.tensor.adds(x4, 2.0) - x6: pl.Tensor[[64], pl.FP32] = pl.yield_(x5) - x7: pl.Tensor[[64], pl.FP32] = pl.yield_(x6) - x8: pl.Tensor[[64], pl.FP32] = pl.yield_(x7) - return x8 - - After = passes.interchange_chunk_loops()(_prepare_for_interchange(Before)) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_scalar_assign_not_wrapped_outline_no_crash(self): - """Scalar assignment stays in orchestration after outline — no undefined variable.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for ob in pl.range(0, 8): - offset: pl.Scalar[pl.INDEX] = ob * 4 # noqa: F841 - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 2.0) - return x - - program = _prepare_for_interchange(Input) - program = passes.interchange_chunk_loops()(program) - # This should not crash with undefined variable references - program = passes.outline_incore_scopes()(program) - - incore_funcs = [f for f in program.functions.values() if f.func_type == ir.FunctionType.InCore] - assert len(incore_funcs) >= 1 - - -class TestEndToEndNoComputeLeaks: - """End-to-end tests 
verifying no compute tensor ops leak into Orchestration.""" - - def _run_through_outline(self, program): - """Run prerequisite passes + interchange + outline.""" - program = _prepare_for_interchange(program) - program = passes.interchange_chunk_loops()(program) - program = passes.outline_incore_scopes()(program) - return program - - # Host-side tensor ops that are allowed in Orchestration - _HOST_SIDE_OPS = { - "tensor.create", - "tensor.read", - "tensor.write", - "tensor.slice", - "tensor.assemble", - "tensor.dim", - "tensor.reshape", - "tensor.transpose", - } - - def _assert_no_compute_leaks(self, program, min_incore_funcs=1): - """Assert no compute tensor ops in Orchestration and enough InCore functions exist.""" - for func in program.functions.values(): - if func.func_type == ir.FunctionType.Orchestration: - func_str = python_print(func) - for match in re.findall(r"tensor\.\w+", func_str): - assert match in self._HOST_SIDE_OPS, ( - f"Compute tensor op '{match}' leaked into Orchestration" - ) - - incore_funcs = [f for f in program.functions.values() if f.func_type == ir.FunctionType.InCore] - assert len(incore_funcs) >= min_incore_funcs - - def test_standalone_op_outlined(self): - """Standalone op inside auto_incore: outlined into InCore function.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - x = pl.add(x, 1.0) - return x - - After = self._run_through_outline(Input) - self._assert_no_compute_leaks(After, min_incore_funcs=1) - - def test_mix_standalone_and_parallel_chunk_outlined(self): - """Mix of standalone + parallel chunk: two InCore functions, orchestration clean.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - x = pl.add(x, 1.0) - for i in 
pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 2.0) - return x - - After = self._run_through_outline(Input) - self._assert_no_compute_leaks(After, min_incore_funcs=2) - - def test_sequential_chunk_outlined(self): - """Sequential chunk inside auto_incore: one InCore function containing the whole loop chain.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - After = self._run_through_outline(Input) - self._assert_no_compute_leaks(After, min_incore_funcs=1) - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/tests/ut/ir/transforms/test_ir_property.py b/tests/ut/ir/transforms/test_ir_property.py index 811d82d99..6f22a73ae 100644 --- a/tests/ut/ir/transforms/test_ir_property.py +++ b/tests/ut/ir/transforms/test_ir_property.py @@ -23,7 +23,7 @@ def test_property_values_exist(self): assert passes.IRProperty.NoNestedCalls is not None assert passes.IRProperty.NormalizedStmtStructure is not None assert passes.IRProperty.NoRedundantBlocks is not None - assert passes.IRProperty.SplitIncoreOrch is not None + assert passes.IRProperty.HierarchyOutlined is not None assert passes.IRProperty.HasMemRefs is not None def test_property_values_are_different(self): @@ -34,7 +34,7 @@ def test_property_values_are_different(self): passes.IRProperty.NoNestedCalls, passes.IRProperty.NormalizedStmtStructure, passes.IRProperty.NoRedundantBlocks, - passes.IRProperty.SplitIncoreOrch, + passes.IRProperty.HierarchyOutlined, passes.IRProperty.HasMemRefs, ] assert len(props) == len(set(props)) @@ -185,12 +185,23 @@ def test_flatten_call_expr_requires_and_produces_ssa(self): assert p.get_produced_properties().contains(passes.IRProperty.SSAForm) assert 
p.get_produced_properties().contains(passes.IRProperty.NoNestedCalls) - def test_outline_incore_requires_and_produces_ssa(self): - """Test OutlineIncoreScopes requires and produces SSAForm.""" + def test_outline_hierarchy_requires_and_produces_ssa(self): + """Test OutlineHierarchyScopes requires and produces SSAForm. + + HierarchyOutlined is *not* produced here — CORE_GROUP scopes survive this + pass and are outlined by OutlineIncoreScopes, which produces the property. + """ + p = passes.outline_hierarchy_scopes() + assert p.get_required_properties().contains(passes.IRProperty.SSAForm) + assert p.get_produced_properties().contains(passes.IRProperty.SSAForm) + assert not p.get_produced_properties().contains(passes.IRProperty.HierarchyOutlined) + + def test_outline_incore_requires_ssa_produces_hierarchy_outlined(self): + """OutlineIncoreScopes requires SSAForm and produces SSAForm + HierarchyOutlined.""" p = passes.outline_incore_scopes() assert p.get_required_properties().contains(passes.IRProperty.SSAForm) assert p.get_produced_properties().contains(passes.IRProperty.SSAForm) - assert p.get_produced_properties().contains(passes.IRProperty.SplitIncoreOrch) + assert p.get_produced_properties().contains(passes.IRProperty.HierarchyOutlined) def test_outline_cluster_requires_and_produces_ssa(self): """Test OutlineClusterScopes requires and produces SSAForm.""" diff --git a/tests/ut/ir/transforms/test_normalize_return_order.py b/tests/ut/ir/transforms/test_normalize_return_order.py index 461cdada7..8d69bcd0b 100644 --- a/tests/ut/ir/transforms/test_normalize_return_order.py +++ b/tests/ut/ir/transforms/test_normalize_return_order.py @@ -378,7 +378,7 @@ def test_pass_name(self): def test_required_properties(self): p = passes.normalize_return_order() required = p.get_required_properties() - assert required.contains(passes.IRProperty.SplitIncoreOrch) + assert required.contains(passes.IRProperty.HierarchyOutlined) assert required.contains(passes.IRProperty.IncoreTileOps) def 
test_no_produced_properties(self): diff --git a/tests/ut/ir/transforms/test_outline_hierarchy_scopes.py b/tests/ut/ir/transforms/test_outline_hierarchy_scopes.py index b26ec402c..59e406495 100644 --- a/tests/ut/ir/transforms/test_outline_hierarchy_scopes.py +++ b/tests/ut/ir/transforms/test_outline_hierarchy_scopes.py @@ -147,8 +147,14 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: After = passes.outline_hierarchy_scopes()(Before) ir.assert_structural_equal(After, Expected) - def test_outline_hierarchy_with_incore_preserved(self): - """Test that InCore scope inside Hierarchy scope is preserved (not outlined by this pass).""" + def test_outline_hierarchy_with_nested_core_group_preserves_core_group(self): + """CORE_GROUP scope nested inside HOST is preserved verbatim in the outlined function. + + outline_hierarchy_scopes only outlines non-CORE_GROUP Hierarchy scopes. + After this pass, the HOST scope is outlined into a new Opaque function + whose body still contains the CORE_GROUP scope unchanged. The + CORE_GROUP scope is later outlined by outline_incore_scopes. 
+ """ @pl.program class Before: @@ -159,23 +165,17 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) return y - @pl.program - class Expected: - @pl.function(level=pl.Level.HOST, role=pl.Role.Worker) - def main_host_worker_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return y - - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = self.main_host_worker_0(x) - return y - Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) After = passes.outline_hierarchy_scopes()(Before) - ir.assert_structural_equal(After, Expected) + # The HOST scope was outlined into an Opaque function; main and the new + # function exist, no InCore function appears yet. + func_types = {gv.name: func.func_type for gv, func in After.functions.items()} + assert "main" in func_types + host_funcs = [n for n in func_types if "host_worker" in n] + assert len(host_funcs) == 1 + assert func_types[host_funcs[0]] == ir.FunctionType.Opaque + # No CORE_GROUP InCore outlining happens in this pass + assert not any(t == ir.FunctionType.InCore for t in func_types.values()) def test_outline_hierarchy_multiple_inputs(self): """Test outlining scope that uses multiple outer variables.""" @@ -292,8 +292,13 @@ def main(self, x: pl.Tensor[[64], pl.FP32], cond: pl.Scalar[pl.BOOL]) -> pl.Tens After = passes.outline_hierarchy_scopes()(Before) ir.assert_structural_equal(After, Expected) - def test_hierarchy_does_not_affect_incore_scopes(self): - """Test that OutlineHierarchyScopes does not outline InCore scopes.""" + def test_hierarchy_preserves_core_group_scopes(self): + """CORE_GROUP hierarchy scopes are NOT outlined by outline_hierarchy_scopes. 
+ + OutlineHierarchyScopes is responsible for non-CORE_GROUP scopes only; + CORE_GROUP scopes survive intact and are outlined into InCore functions + by the subsequent OutlineIncoreScopes pass. + """ @pl.program class Before: @@ -305,7 +310,7 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: Before = passes.convert_to_ssa()(Before) After = passes.outline_hierarchy_scopes()(Before) - # InCore scopes should remain untouched by the hierarchy pass + # Nothing was outlined; main keeps Opaque, no InCore function appears. ir.assert_structural_equal(After, Before) def test_hierarchy_does_not_affect_cluster_scopes(self): @@ -499,38 +504,6 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: Reparsed = pl.parse_program(printed) ir.assert_structural_equal(After, Reparsed) - def test_outline_then_incore(self): - """Test hierarchy outlined first, then InCore outlined from inside hierarchy function.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.HOST, role=pl.Role.Worker): - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return y - - Before = passes.convert_to_ssa()(Before) - - # Step 1: Outline hierarchy scopes - After1 = passes.outline_hierarchy_scopes()(Before) - - # The outlined hierarchy function should contain the InCore scope - hierarchy_func = After1.get_function("main_host_worker_0") - assert hierarchy_func is not None - assert hierarchy_func.level == ir.Level.HOST - printed1 = After1.as_python() - assert "pl.at(level=pl.Level.CORE_GROUP)" in printed1 - - # Step 2: Outline incore scopes (processes Opaque functions including hierarchy-outlined ones) - After2 = passes.outline_incore_scopes()(After1) - - # The InCore scope should now be outlined from the hierarchy function - incore_func = After2.get_function("main_host_worker_0_incore_0") - assert incore_func is not None - assert 
incore_func.func_type == ir.FunctionType.InCore - def test_outline_hierarchy_with_alias_level(self): """Test that level aliases (POD = CLUSTER_0) resolve to canonical name.""" @@ -553,7 +526,12 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: class TestHierarchyOutlinedVerifier: - """Tests for the HierarchyOutlined property verifier.""" + """Tests for the HierarchyOutlined property verifier. + + HierarchyOutlined is jointly established by OutlineHierarchyScopes (handles + non-CORE_GROUP) and OutlineIncoreScopes (handles CORE_GROUP). Verification + only passes once both have run (or once both kinds of scopes are absent). + """ @staticmethod def _hierarchy_outlined_props(): @@ -576,6 +554,7 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: with ctx: program = passes.convert_to_ssa()(Input) program = passes.outline_hierarchy_scopes()(program) + program = passes.outline_incore_scopes()(program) # Should not throw — no Hierarchy scopes remain passes.verify_properties(self._hierarchy_outlined_props(), program, "test") @@ -600,6 +579,26 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: with pytest.raises(Exception, match="Hierarchy ScopeStmt"): passes.verify_properties(self._hierarchy_outlined_props(), program, "test") + def test_remaining_core_group_scope_fails_verification(self): + """A surviving CORE_GROUP scope (only OutlineHierarchyScopes ran) fails verification.""" + + @pl.program + class Input: + @pl.function + def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + with pl.at(level=pl.Level.CORE_GROUP): + y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) + return y + + ctx = passes.PassContext([], passes.VerificationLevel.NONE) + with ctx: + program = passes.convert_to_ssa()(Input) + # outline_hierarchy_scopes alone leaves CORE_GROUP scopes intact + program = passes.outline_hierarchy_scopes()(program) + + with pytest.raises(Exception, match="Hierarchy ScopeStmt"): + 
passes.verify_properties(self._hierarchy_outlined_props(), program, "test") + def test_program_without_hierarchy_passes_verification(self): """Program that never had Hierarchy scopes passes verification.""" diff --git a/tests/ut/ir/transforms/test_outline_incore_interleaved_ops.py b/tests/ut/ir/transforms/test_outline_incore_interleaved_ops.py deleted file mode 100644 index 1f0e0d8f9..000000000 --- a/tests/ut/ir/transforms/test_outline_incore_interleaved_ops.py +++ /dev/null @@ -1,567 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- - -"""Regression tests for non-parallel code inside auto_incore losing InCore scope. - -Root cause ----------- -InterchangeChunkLoops consumes ``auto_incore`` and wraps each interchanged -parallel chunk body in ``ScopeStmt(InCore)``. However, non-parallel code -(range loops, straight-line ops) that sits *between* parallel chunk loops -inside the same ``auto_incore`` scope is left without an InCore wrapper. - -``WrapNonIncoreStatementsInInCore`` only operates on the direct children of -the ``auto_incore`` body. When the body is a single ``ForStmt`` (e.g. 
a -``pl.range`` loop) whose body *contains* InCore scopes from the interchanged -parallel chunks, ``ContainsInCoreScope`` returns ``True`` for the entire -``ForStmt``, so the function returns it as-is — leaving non-parallel code -inside the loop body unwrapped. - -Consequence: ``OutlineIncoreScopes`` cannot outline these unwrapped -operations, so they stay in the Orchestration function as bare tensor ops -(including matmul), which downstream passes (ConvertTensorToTileOps, -ExpandMixedKernel, etc.) cannot process correctly. - -This reproduces the issue observed in the Qwen3SingleLayerDecode model where -the MLP gate/up projection matmuls remained in the Orchestration function. -""" - -import pypto.language as pl -import pytest -from pypto import ir, passes - - -def _run_pipeline(program): - """Run prerequisite passes plus interchange + outline. - - This is the full pipeline exercised by these tests: it reproduces the - setup that triggered the original bug (parallel chunks + non-parallel - code inside auto_incore). 
- """ - program = passes.unroll_loops()(program) - program = passes.convert_to_ssa()(program) - program = passes.flatten_call_expr()(program) - program = passes.split_chunked_loops()(program) - program = passes.interchange_chunk_loops()(program) - program = passes.outline_incore_scopes()(program) - return program - - -class TestNonParallelCodeBetweenChunks: - """Non-parallel code between parallel chunk loops inside auto_incore - must be wrapped in InCore scope so that OutlineIncoreScopes can outline it.""" - - def test_interleaved_scalar_op_gets_incore(self): - """A scalar op between two parallel chunks must get an InCore scope.""" - - @pl.program - class Before: - @pl.function - def main( - self, - x: pl.Tensor[[8, 64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.range(0, 8, 4): - for i in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.adds(x, 1.0) - y: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(x, 2.0) - for j in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.add(x, y) - return x - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_0(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for i1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_1(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - y0: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(x0, 2.0) - return y0 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_2( - self, - x0: pl.Tensor[[8, 64], pl.FP32], - y0: pl.Tensor[[8, 
64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - for j1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.add(x1, y0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for b0, (x1,) in pl.range(0, 8, 4, init_values=(x0,)): - for i0, (x2,) in pl.parallel( - 2, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x3: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_0(x2) - x4: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x3) - y0: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_1(x4) - for j0, (x5,) in pl.parallel( - 2, init_values=(x4,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x6: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_2(x5, y0) - x7: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x6) - x8: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x7) - return x8 - - After = _run_pipeline(Before) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_interleaved_range_loop_gets_incore(self): - """A range loop between parallel chunks must get an InCore scope. - - This mirrors the Qwen3 MLP pattern: a pl.range() loop containing - matmul sits between two pl.parallel() chunk loops. 
- """ - - @pl.program - class Before: - @pl.function - def main( - self, - x: pl.Tensor[[8, 64], pl.FP32], - w: pl.Tensor[[64, 64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.range(0, 8, 4): - for i in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.adds(x, 1.0) - for k in pl.range(2): - x = pl.tensor.matmul(x, w) - for j in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.adds(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_0(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for i1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_1( - self, - x0: pl.Tensor[[8, 64], pl.FP32], - w0: pl.Tensor[[64, 64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - for k0, (x1,) in pl.range(2, init_values=(x0,)): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.matmul( - x1, w0, a_trans=False, b_trans=False, c_matrix_nz=False - ) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_2(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for j1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.Orchestration) - def main( - self, - x0: pl.Tensor[[8, 64], pl.FP32], - w0: pl.Tensor[[64, 64], pl.FP32], - ) -> 
pl.Tensor[[8, 64], pl.FP32]: - for b0, (x1,) in pl.range(0, 8, 4, init_values=(x0,)): - for i0, (x2,) in pl.parallel( - 2, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x3: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_0(x2) - x4: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_1(x4, w0) - for j0, (x6,) in pl.parallel( - 2, init_values=(x5,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x7: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_2(x6) - x8: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x7) - x9: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x8) - return x9 - - After = _run_pipeline(Before) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_all_ops_outlined_end_to_end(self): - """End-to-end: all compute ops inside auto_incore must be outlined. - - Same structure as ``test_interleaved_scalar_op_gets_incore`` — this - test is retained as a stronger end-to-end check (same expected output). 
- """ - - @pl.program - class Before: - @pl.function - def main( - self, - x: pl.Tensor[[8, 64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.range(0, 8, 4): - for i in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.adds(x, 1.0) - y: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(x, 2.0) - for j in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.add(x, y) - return x - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_0(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for i1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_1(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - y0: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(x0, 2.0) - return y0 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_2( - self, - x0: pl.Tensor[[8, 64], pl.FP32], - y0: pl.Tensor[[8, 64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - for j1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.add(x1, y0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for b0, (x1,) in pl.range(0, 8, 4, init_values=(x0,)): - for i0, (x2,) in pl.parallel( - 2, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x3: pl.Tensor[[8, 64], pl.FP32] = 
self.main_incore_0(x2) - x4: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x3) - y0: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_1(x4) - for j0, (x5,) in pl.parallel( - 2, init_values=(x4,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x6: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_2(x5, y0) - x7: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x6) - x8: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x7) - return x8 - - After = _run_pipeline(Before) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestNestedForStmtRecursion: - """The fix recurses into ForStmt bodies that contain InCore scopes. - These tests verify the recursion works for deeper nesting and edge cases.""" - - def test_doubly_nested_range_with_interleaved_op(self): - """Non-parallel op inside a doubly nested range loop must get InCore scope.""" - - @pl.program - class Before: - @pl.function - def main( - self, - x: pl.Tensor[[8, 64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.range(0, 8, 4): - for c in pl.range(2): - for i in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.adds(x, 1.0) - y: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(x, 3.0) - for j in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.add(x, y) - return x - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_0(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for i1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_1(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - y0: 
pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(x0, 3.0) - return y0 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_2( - self, - x0: pl.Tensor[[8, 64], pl.FP32], - y0: pl.Tensor[[8, 64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - for j1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.add(x1, y0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for b0, (x1,) in pl.range(0, 8, 4, init_values=(x0,)): - for c0, (x2,) in pl.range(2, init_values=(x1,)): - for i0, (x3,) in pl.parallel( - 2, init_values=(x2,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x4: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_0(x3) - x5: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x4) - y0: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_1(x5) - for j0, (x6,) in pl.parallel( - 2, init_values=(x5,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x7: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_2(x6, y0) - x8: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x7) - x9: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x8) - x10: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x9) - return x10 - - After = _run_pipeline(Before) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_single_forstmt_body_with_mixed_children(self): - """auto_incore body is a single ForStmt (not SeqStmts). - - This is the exact trigger for the original bug: ContainsInCoreScope - returns True for the ForStmt, so the old code returned it as-is - without examining its children. 
- """ - - @pl.program - class Before: - @pl.function - def main( - self, - x: pl.Tensor[[8, 64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.range(0, 8, 4): - for i in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.adds(x, 1.0) - x = pl.tensor.muls(x, 2.0) - return x - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_0(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for i1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_1(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - x1: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(x0, 2.0) - return x1 - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for b0, (x1,) in pl.range(0, 8, 4, init_values=(x0,)): - for i0, (x2,) in pl.parallel( - 2, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x3: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_0(x2) - x4: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_1(x4) - x6: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x5) - return x6 - - After = _run_pipeline(Before) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_multiple_non_parallel_ops_between_chunks(self): - """Multiple consecutive non-parallel ops between chunks must all be wrapped.""" - - @pl.program - class Before: - @pl.function - def main( - self, - x: pl.Tensor[[8, 64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - with 
pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.range(0, 8, 4): - for i in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.adds(x, 1.0) - y: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(x, 2.0) - z: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.add(x, y) - x = pl.tensor.muls(z, 0.5) - for j in pl.parallel(4, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.adds(x, 1.0) - return x - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_0(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for i1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_1(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - y0: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(x0, 2.0) - z0: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.add(x0, y0) - x1: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(z0, 0.5) - return x1 - - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_2(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for j1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for b0, (x1,) in pl.range(0, 8, 4, init_values=(x0,)): - for i0, (x2,) in pl.parallel( - 2, init_values=(x1,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x3: pl.Tensor[[8, 64], pl.FP32] = 
self.main_incore_0(x2) - x4: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x3) - x5: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_1(x4) - for j0, (x6,) in pl.parallel( - 2, init_values=(x5,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x7: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_2(x6) - x8: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x7) - x9: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x8) - return x9 - - After = _run_pipeline(Before) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - def test_no_parallel_chunks_no_wrapping(self): - """auto_incore with only non-parallel code (no chunks) should not crash. - - When there are no interchanged parallel chunks, there are no InCore - scopes to trigger recursion. The function should still work correctly — - the whole body becomes a single InCore function. - """ - - @pl.program - class Before: - @pl.function - def main( - self, - x: pl.Tensor[[8, 64], pl.FP32], - ) -> pl.Tensor[[8, 64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for b in pl.range(0, 8, 4): - x = pl.tensor.adds(x, 1.0) - x = pl.tensor.muls(x, 2.0) - return x - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_0(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - for b0, (x1,) in pl.range(0, 8, 4, init_values=(x0,)): - x2: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[8, 64], pl.FP32] = pl.tensor.muls(x2, 2.0) - x4: pl.Tensor[[8, 64], pl.FP32] = pl.yield_(x3) - return x4 - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x0: pl.Tensor[[8, 64], pl.FP32]) -> pl.Tensor[[8, 64], pl.FP32]: - x1: pl.Tensor[[8, 64], pl.FP32] = self.main_incore_0(x0) - return x1 - - After = _run_pipeline(Before) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -class TestHostSideTailOps: - """Host-side tensor ops may stay in 
Orchestration after outline.""" - - def test_tail_assemble_after_parallel_chunk_stays_in_orchestration(self): - """A trailing tensor.assemble should remain in the Orchestration function.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[4], pl.FP32]) -> pl.Tensor[[8], pl.FP32]: - out_0: pl.Tensor[[8], pl.FP32] = pl.tensor.create( - [8], dtype=pl.FP32, layout=pl.TensorLayout.ND - ) - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 4, 1, chunk=2, chunk_policy="leading_full"): - x = pl.tensor.adds(x, 1.0) - out_1: pl.Tensor[[8], pl.FP32] = pl.tensor.assemble(out_0, x, [0]) - return out_1 - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore, attrs={"split": pl.SplitMode.UP_DOWN}) - def main_incore_0(self, x0: pl.Tensor[[4], pl.FP32]) -> pl.Tensor[[4], pl.FP32]: - for i1, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x2: pl.Tensor[[4], pl.FP32] = pl.tensor.adds(x1, 1.0) - x3: pl.Tensor[[4], pl.FP32] = pl.yield_(x2) - return x3 - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x0: pl.Tensor[[4], pl.FP32]) -> pl.Tensor[[8], pl.FP32]: - out_0: pl.Tensor[[8], pl.FP32] = pl.tensor.create( - [8], dtype=pl.FP32, layout=pl.TensorLayout.ND - ) - for i0, (x1,) in pl.parallel( - 2, init_values=(x0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - x2: pl.Tensor[[4], pl.FP32] = self.main_incore_0(x1) - x3: pl.Tensor[[4], pl.FP32] = pl.yield_(x2) - out_1: pl.Tensor[[8], pl.FP32] = pl.tensor.assemble(out_0, x3, [0]) - return out_1 - - After = _run_pipeline(Before) - ir.assert_structural_equal(After, Expected, enable_auto_mapping=True) - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/tests/ut/ir/transforms/test_outline_incore_scopes.py b/tests/ut/ir/transforms/test_outline_incore_scopes.py index a8015421b..a000f9e1f 100644 --- 
a/tests/ut/ir/transforms/test_outline_incore_scopes.py +++ b/tests/ut/ir/transforms/test_outline_incore_scopes.py @@ -7,21 +7,24 @@ # See LICENSE in the root of the software repository for the full text of the License. # ----------------------------------------------------------------------------------------------------------- -"""Unit tests for OutlineIncoreScopes pass.""" +"""Unit tests for OutlineIncoreScopes pass. -import re +OutlineIncoreScopes outlines `HierarchyScopeStmt(level=CORE_GROUP)` into +`Function(InCore)` and promotes the parent function from `Opaque` to +`Orchestration`. It runs after OutlineHierarchyScopes, which handles all +non-CORE_GROUP Hierarchy scopes. +""" import pypto.language as pl import pytest from pypto import ir, passes -from pypto.ir.printer import python_print class TestOutlineIncoreScopes: """Test OutlineIncoreScopes pass.""" - def test_outline_simple_incore_scope(self): - """Test outlining a simple InCore scope.""" + def test_outline_simple_core_group_scope(self): + """A single CORE_GROUP scope becomes an InCore function; main is promoted to Orchestration.""" @pl.program class Before: @@ -31,782 +34,156 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) return y - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def main_incore_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return y - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = self.main_incore_0(x) - return y - - # Convert to SSA first (required by outline pass) Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) - - # Apply outline pass After = passes.outline_incore_scopes()(Before) - # Should be structurally equal - ir.assert_structural_equal(After, Expected) - - def 
test_outline_multiple_incore_scopes(self): - """Test outlining multiple InCore scopes in one function.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - with pl.at(level=pl.Level.CORE_GROUP): - z: pl.Tensor[[64], pl.FP32] = pl.mul(y, y) - return z - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def main_incore_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return y - - @pl.function(type=pl.FunctionType.InCore) - def main_incore_1(self, y: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - z: pl.Tensor[[64], pl.FP32] = pl.mul(y, y) - return z - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = self.main_incore_0(x) - z: pl.Tensor[[64], pl.FP32] = self.main_incore_1(y) - return z - - # Convert to SSA first - Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) - - # Apply outline pass - After = passes.outline_incore_scopes()(Before) + func_types = {gv.name: func.func_type for gv, func in After.functions.items()} + # Parent promoted + assert func_types["main"] == ir.FunctionType.Orchestration + # Exactly one outlined InCore function with "core_group" in its name + incore_funcs = [(n, t) for n, t in func_types.items() if t == ir.FunctionType.InCore] + assert len(incore_funcs) == 1 + assert "core_group" in incore_funcs[0][0] - # Should be structurally equal - ir.assert_structural_equal(After, Expected) + def test_outline_preserves_non_core_group_scopes(self): + """Non-CORE_GROUP Hierarchy scopes are left intact for OutlineHierarchyScopes. 
- def test_outline_preserves_non_incore_functions(self): - """Test that non-InCore functions are preserved unchanged.""" + Run with verification disabled because OutlineIncoreScopes claims to + produce HierarchyOutlined; a leftover HOST scope (which would normally + have been removed by OutlineHierarchyScopes earlier in the pipeline) + intentionally fails that property — we only care that the pass itself + is a no-op for non-CORE_GROUP scopes. + """ @pl.program class Before: @pl.function - def helper(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - result: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return result - - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.mul(x, x) - return y - - @pl.program - class Expected: - @pl.function - def helper(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - result: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return result - - @pl.function(type=pl.FunctionType.InCore) - def main_incore_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = pl.mul(x, x) - return y - - @pl.function(type=pl.FunctionType.Orchestration) def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = self.main_incore_0(x) + with pl.at(level=pl.Level.HOST, role=pl.Role.Worker): + y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) return y - # Convert to SSA first - Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) - - # Apply outline pass - After = passes.outline_incore_scopes()(Before) + with passes.PassContext([], passes.VerificationLevel.NONE): + Before = passes.convert_to_ssa()(Before) + After = passes.outline_incore_scopes()(Before) + # Pass is a no-op — no CORE_GROUP scope present. 
+ ir.assert_structural_equal(After, Before) - # Should be structurally equal - ir.assert_structural_equal(After, Expected) - - def test_outline_scope_with_multiple_inputs(self): - """Test outlining scope that uses multiple outer variables.""" - - @pl.program - class Before: - @pl.function - def main( - self, x: pl.Tensor[[64], pl.FP32], y: pl.Tensor[[64], pl.FP32] - ) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = pl.add(x, y) - b: pl.Tensor[[64], pl.FP32] = pl.mul(x, y) - with pl.at(level=pl.Level.CORE_GROUP): - result: pl.Tensor[[64], pl.FP32] = pl.add(a, b) - return result - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def main_incore_0( - self, a: pl.Tensor[[64], pl.FP32], b: pl.Tensor[[64], pl.FP32] - ) -> pl.Tensor[[64], pl.FP32]: - result: pl.Tensor[[64], pl.FP32] = pl.add(a, b) - return result - - @pl.function(type=pl.FunctionType.Orchestration) - def main( - self, x: pl.Tensor[[64], pl.FP32], y: pl.Tensor[[64], pl.FP32] - ) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = pl.add(x, y) - b: pl.Tensor[[64], pl.FP32] = pl.mul(x, y) - result: pl.Tensor[[64], pl.FP32] = self.main_incore_0(a, b) - return result - - Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) - After = passes.outline_incore_scopes()(Before) - ir.assert_structural_equal(After, Expected) - - def test_outline_scope_with_multiple_outputs(self): - """Test outlining scope that produces multiple values. - - The Before/After pattern can't express TupleGetItem in the DSL, - so we verify properties directly. 
- """ + def test_outline_split_propagates_to_incore_function(self): + """`pl.split(...)` on a CORE_GROUP scope is forwarded to the outlined InCore fn.""" @pl.program class Before: @pl.function def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP): + with pl.at( + level=pl.Level.CORE_GROUP, + optimizations=[pl.split(pl.SplitMode.UP_DOWN)], + ): y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - z: pl.Tensor[[64], pl.FP32] = pl.mul(x, x) - result: pl.Tensor[[64], pl.FP32] = pl.add(y, z) - return result - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def main_incore_0( - self, x: pl.Tensor[[64], pl.FP32] - ) -> tuple[pl.Tensor[[64], pl.FP32], pl.Tensor[[64], pl.FP32]]: - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - z: pl.Tensor[[64], pl.FP32] = pl.mul(x, x) - return (y, z) - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - ret = self.main_incore_0(x) - y = ret[0] - z = ret[1] - result: pl.Tensor[[64], pl.FP32] = pl.add(y, z) - return result + return y Before = passes.convert_to_ssa()(Before) After = passes.outline_incore_scopes()(Before) + incore_funcs = [f for _, f in After.functions.items() if f.func_type == ir.FunctionType.InCore] + assert len(incore_funcs) == 1 + # `split` attr round-trips as the SplitMode's underlying int value. 
+ attrs = dict(incore_funcs[0].attrs) + assert attrs.get("split") == ir.SplitMode.UP_DOWN.value - ir.assert_structural_equal(After, Expected) - - def test_nested_incore_scopes_rejected_by_verifier(self): - """Nested InCore scopes are rejected by the NoNestedInCore structural verifier.""" + def test_pipeline_order_outlines_nested_core_group(self): + """Hierarchy then Incore outlining cleanly handles a CORE_GROUP nested inside HOST.""" @pl.program class Before: @pl.function def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) + with pl.at(level=pl.Level.HOST, role=pl.Role.Worker): with pl.at(level=pl.Level.CORE_GROUP): - z: pl.Tensor[[64], pl.FP32] = pl.mul(y, y) - return z - - # Verify directly (no pass pipeline) — nested InCore is a structural invariant violation - props = passes.IRPropertySet() - props.insert(passes.IRProperty.NoNestedInCore) - diagnostics = passes.PropertyVerifierRegistry.verify(props, Before) - errors = [d for d in diagnostics if d.severity == passes.DiagnosticSeverity.Error] - assert len(errors) >= 1 - assert "Nested InCore scope" in errors[0].message - - def test_outline_scope_with_single_input_single_output(self): - """Test outlining scope with simple single input/output.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.mul(a, a) - result: pl.Tensor[[64], pl.FP32] = pl.add(y, y) - return result - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def main_incore_0(self, a: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = pl.mul(a, a) + y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) return y - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x: pl.Tensor[[64], pl.FP32]) 
-> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - y: pl.Tensor[[64], pl.FP32] = self.main_incore_0(a) - result: pl.Tensor[[64], pl.FP32] = pl.add(y, y) - return result + program = passes.convert_to_ssa()(Before) + program = passes.outline_hierarchy_scopes()(program) + program = passes.outline_incore_scopes()(program) - Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) - After = passes.outline_incore_scopes()(Before) - ir.assert_structural_equal(After, Expected) + func_types = {gv.name: func.func_type for gv, func in program.functions.items()} + # The inner HOST function (which originally wrapped the CORE_GROUP scope) + # must have been promoted to Orchestration when its CORE_GROUP child got + # outlined. Distinguish it from the further-outlined CORE_GROUP function + # (whose name extends `main_host_worker_…` with `_core_group_…`) by + # filtering out names that *also* contain `core_group`. + host_only_funcs = [n for n in func_types if "host_worker" in n and "core_group" not in n] + assert len(host_only_funcs) == 1 + assert func_types[host_only_funcs[0]] == ir.FunctionType.Orchestration + # An InCore function exists. + assert any(t == ir.FunctionType.InCore for t in func_types.values()) + # main itself (which only contained the HOST scope, not a CORE_GROUP + # directly) stays Opaque. 
+ assert func_types["main"] == ir.FunctionType.Opaque - def test_outline_multiple_functions_with_scopes(self): - """Test outlining scopes in multiple functions (independent numbering).""" + def test_no_core_group_passthrough(self): + """Functions without CORE_GROUP scopes pass through unchanged.""" @pl.program class Before: @pl.function - def func1(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return y - - @pl.function - def func2(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.mul(x, x) - return y - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def func1_incore_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) return y - @pl.function(type=pl.FunctionType.Orchestration) - def func1(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = self.func1_incore_0(x) - return y - - @pl.function(type=pl.FunctionType.InCore) - def func2_incore_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = pl.mul(x, x) - return y - - @pl.function(type=pl.FunctionType.Orchestration) - def func2(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = self.func2_incore_0(x) - return y - Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) After = passes.outline_incore_scopes()(Before) - ir.assert_structural_equal(After, Expected) + ir.assert_structural_equal(After, Before) - def test_outline_scope_in_control_flow(self): - """Test outlining scope inside conditional statement.""" + def test_outline_skips_non_opaque_functions(self): + """Already-typed (InCore/Orchestration/...) 
functions are not touched.""" @pl.program class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32], cond: pl.Scalar[pl.BOOL]) -> pl.Tensor[[64], pl.FP32]: - if cond: - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) # type: ignore[no-redef] - else: - y: pl.Tensor[[64], pl.FP32] = pl.mul(x, x) # type: ignore[no-redef,unreachable] - return y - - @pl.program - class Expected: @pl.function(type=pl.FunctionType.InCore) - def main_incore_0(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: + def compute(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) return y - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x: pl.Tensor[[64], pl.FP32], cond: pl.Scalar[pl.BOOL]) -> pl.Tensor[[64], pl.FP32]: - if cond: - y: pl.Tensor[[64], pl.FP32] = self.main_incore_0(x) # type: ignore[no-redef] - else: - y: pl.Tensor[[64], pl.FP32] = pl.mul(x, x) # type: ignore[no-redef,unreachable] - return y - - Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) - After = passes.outline_incore_scopes()(Before) - ir.assert_structural_equal(After, Expected) - - def test_outline_incore_with_if_yield(self): - """Test outline_incore_scopes with IfStmt containing unannotated yields (issue #233).""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32], cond: pl.Scalar[pl.BOOL]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP): - if cond: - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - z = pl.yield_(y) # Unannotated - should infer type - else: - y2: pl.Tensor[[64], pl.FP32] = pl.mul(x, x) - z = pl.yield_(y2) - return z - - Before = passes.convert_to_ssa()(Before) - After = passes.outline_incore_scopes()(Before) - - printed = After.as_python() - # The outlined incore function should have correct return type, not Tensor[[1], INT32] - assert "Tensor[[1], pl.INT32]" not in 
printed - assert "Tensor[[64], pl.FP32]" in printed - - def test_outline_scope_with_intermediate_computation(self): - """Test outlining scope with computation before, inside, and after.""" - - @pl.program - class Before: @pl.function def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - b: pl.Tensor[[64], pl.FP32] = pl.mul(a, a) - with pl.at(level=pl.Level.CORE_GROUP): - c: pl.Tensor[[64], pl.FP32] = pl.add(b, b) - d: pl.Tensor[[64], pl.FP32] = pl.mul(c, c) - e: pl.Tensor[[64], pl.FP32] = pl.add(d, d) - return e - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def main_incore_0(self, b: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - c: pl.Tensor[[64], pl.FP32] = pl.add(b, b) - d: pl.Tensor[[64], pl.FP32] = pl.mul(c, c) - return d - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - b: pl.Tensor[[64], pl.FP32] = pl.mul(a, a) - d: pl.Tensor[[64], pl.FP32] = self.main_incore_0(b) - e: pl.Tensor[[64], pl.FP32] = pl.add(d, d) - return e - - Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) - After = passes.outline_incore_scopes()(Before) - ir.assert_structural_equal(After, Expected) - - def test_outline_scope_with_store_only_outputs(self): - """Test outlining scope where the only outputs are store targets. - - When an InCore scope only writes to external tensors via tile.store - (no new variable definitions used after the scope), the store targets - must be recognised as outputs and returned. 
- """ - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[16, 128], pl.FP32]) -> pl.Tensor[[16, 128], pl.FP32]: - buf: pl.Tensor[[16, 128], pl.FP32] = pl.create_tensor([16, 128], dtype=pl.FP32) with pl.at(level=pl.Level.CORE_GROUP): - tile = pl.tile.full([16, 128], dtype=pl.FP32, value=0.0) - pl.store(tile, [0, 0], buf) - result: pl.Tensor[[16, 128], pl.FP32] = pl.add(buf, x) - return result - - Before = passes.convert_to_ssa()(Before) - After = passes.outline_incore_scopes()(Before) - - printed = After.as_python() - # The outlined InCore function should return buf (store target) - assert "return buf" in printed or "return buf_0" in printed - # The orchestration should receive the return value - assert "main_incore_0(" in printed - - def test_outline_scope_with_multiple_store_targets(self): - """Test outlining scope with multiple store targets as outputs. - - Multiple external tensors modified via tile.store should all appear - as return values of the outlined function. 
- """ - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[16, 128], pl.FP32]) -> pl.Tensor[[16, 128], pl.FP32]: - buf_a: pl.Tensor[[16, 128], pl.FP32] = pl.create_tensor([16, 128], dtype=pl.FP32) - buf_b: pl.Tensor[[16, 1], pl.FP32] = pl.create_tensor([16, 1], dtype=pl.FP32) - with pl.at(level=pl.Level.CORE_GROUP): - tile_a = pl.tile.full([16, 128], dtype=pl.FP32, value=0.0) - tile_b = pl.tile.full([16, 1], dtype=pl.FP32, value=0.0) - pl.store(tile_a, [0, 0], buf_a) - pl.store(tile_b, [0, 0], buf_b) - result: pl.Tensor[[16, 128], pl.FP32] = pl.add(buf_a, x) - return result + y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) + return y Before = passes.convert_to_ssa()(Before) After = passes.outline_incore_scopes()(Before) - printed = After.as_python() - # Both store targets should appear as outputs - assert "main_incore_0(" in printed - # The InCore function should have return statement - assert ( - "return" in printed.split("@pl.function(type=pl.FunctionType.InCore)")[1].split("@pl.function")[0] - ) + compute = After.get_function("compute") + assert compute is not None + assert compute.func_type == ir.FunctionType.InCore - def test_outline_scope_with_loop_carried_init_values(self): - """Test outlining scope where inner loop references outer loop-carried variable via init_values. + main = After.get_function("main") + assert main is not None + # main got promoted because its CORE_GROUP scope was outlined. + assert main.func_type == ir.FunctionType.Orchestration - Regression test for issue #369: OutlineIncoreScopes failed to include - outer loop-carried variables as incore function parameters when they - appeared only inside IterArg.initValue_ expressions. 
- """ + def test_multiple_core_group_scopes_in_one_function(self): + """Two sibling CORE_GROUP scopes both get outlined; parent promoted once.""" @pl.program class Before: - @pl.function - def main( - self, x: pl.Tensor[[64], pl.FP32], y: pl.Tensor[[64], pl.FP32] - ) -> pl.Tensor[[64], pl.FP32]: - for i, (acc,) in pl.range(3, init_values=(x,)): - with pl.at(level=pl.Level.CORE_GROUP): - for j, (inner,) in pl.range(2, init_values=(acc,)): - updated: pl.Tensor[[64], pl.FP32] = pl.add(inner, y) - inner_rv = pl.yield_(updated) - acc_rv = pl.yield_(inner_rv) - return acc_rv - - Before = passes.convert_to_ssa()(Before) - After = passes.outline_incore_scopes()(Before) - - printed = After.as_python() - incore_section = printed.split("@pl.function(type=pl.FunctionType.InCore)")[1].split("@pl.function")[ - 0 - ] - # Extract parameters between "def ...(self, ...)" — handle multiline signatures - param_match = re.search(r"def \w+\((.*?)\)\s*->", incore_section, re.DOTALL) - assert param_match is not None - incore_params = param_match.group(1) - orch_section = printed.split("@pl.function(type=pl.FunctionType.Orchestration)")[1] - - assert "acc" in incore_params, ( - "outer loop-carried variable 'acc' must be a parameter of the outlined function" - ) - assert "main_incore_0" in orch_section and "acc" in orch_section, ( - "orchestration must pass 'acc' to the outlined function" - ) - - def test_outline_scope_does_not_capture_outer_init_value(self): - """Outer loop's init value must NOT become a parameter of the outlined incore function. - - When an incore scope uses a loop-carried variable (IterArg) from an - outer ForStmt, only the IterArg itself should be captured as a - parameter, not its initValue_ expression. 
- """ - - @pl.program - class Before: - @pl.function - def main( - self, init: pl.Tensor[[64], pl.FP32], y: pl.Tensor[[64], pl.FP32] - ) -> pl.Tensor[[64], pl.FP32]: - for sb, (acc,) in pl.range(4, init_values=(init,)): - with pl.at(level=pl.Level.CORE_GROUP): - result: pl.Tensor[[64], pl.FP32] = pl.add(acc, y) - acc_rv = pl.yield_(result) - return acc_rv - - Before = passes.convert_to_ssa()(Before) - After = passes.outline_incore_scopes()(Before) - - printed = After.as_python() - incore_section = printed.split("@pl.function(type=pl.FunctionType.InCore)")[1].split("@pl.function")[ - 0 - ] - # Extract parameters — handle multiline signatures from ruff formatting - param_match = re.search(r"def \w+\((.*?)\)\s*->", incore_section, re.DOTALL) - assert param_match is not None - incore_params = param_match.group(1) - - assert "acc" in incore_params, "loop-carried 'acc' must be a parameter" - assert "init" not in incore_params, ( - "outer loop's init value 'init' must NOT be a parameter of the incore function" - ) - - -class TestSplitIncoreOrchVerifier: - """Regression tests for the SplitIncoreOrch property verifier.""" - - def _build_outlined_program(self, input_program): - """Run convert_to_ssa + outline_incore_scopes (no verification).""" - ctx = passes.PassContext([], passes.VerificationLevel.NONE) - with ctx: - program = passes.convert_to_ssa()(input_program) - program = passes.outline_incore_scopes()(program) - return program - - @staticmethod - def _split_incore_orch_props(): - ps = passes.IRPropertySet() - ps.insert(passes.IRProperty.SplitIncoreOrch) - return ps - - def test_clean_orchestration_passes_verification(self): - """Outlined program with all compute in InCore passes property verification.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return y - - After = self._build_outlined_program(Input) - 
# Should not throw — no InCore scopes remain, no errors - passes.verify_properties(self._split_incore_orch_props(), After, "test") - - def test_remaining_incore_scope_fails_verification(self): - """Leftover InCore ScopeStmt in non-InCore function causes verification failure.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return y - - # Don't outline — just convert to SSA, leaving InCore scope intact - ctx = passes.PassContext([], passes.VerificationLevel.NONE) - with ctx: - program = passes.convert_to_ssa()(Input) - - # verify_properties should throw because InCore scope remains in Opaque function - with pytest.raises(Exception, match="InCore ScopeStmt"): - passes.verify_properties(self._split_incore_orch_props(), program, "test") - - def test_compute_op_in_orchestration_does_not_fail(self): - """Compute tensor op in Orchestration produces warning (not error), verification passes.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.mul(a, a) - return y - - After = self._build_outlined_program(Input) - # Orchestration has tensor.add — but it's a warning, not an error - # verify_properties should NOT throw - passes.verify_properties(self._split_incore_orch_props(), After, "test") - - def test_outline_does_not_throw_for_clean_program(self): - """Running outline_incore_scopes on a clean program does not throw.""" - - @pl.program - class Input: @pl.function def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: with pl.at(level=pl.Level.CORE_GROUP): y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return y - - # Run with full verification enabled — should not throw - program = 
passes.convert_to_ssa()(Input) - passes.outline_incore_scopes()(program) - - def test_outline_with_compute_outside_incore_verification_passes(self): - """Compute ops outside incore in explicit pl.incore() usage: verification passes (warning only).""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = pl.add(x, x) with pl.at(level=pl.Level.CORE_GROUP): - y: pl.Tensor[[64], pl.FP32] = pl.mul(a, a) - result: pl.Tensor[[64], pl.FP32] = pl.add(y, y) - return result - - # Run with full verification — should pass despite compute ops in orchestration - program = passes.convert_to_ssa()(Input) - After = passes.outline_incore_scopes()(program) - - # Verify the outlined program still has the expected structure - orch_funcs = [f for f in After.functions.values() if f.func_type == ir.FunctionType.Orchestration] - incore_funcs = [f for f in After.functions.values() if f.func_type == ir.FunctionType.InCore] - assert len(orch_funcs) == 1 - assert len(incore_funcs) == 1 - - def test_full_pipeline_with_verification_passes(self): - """Full pipeline with auto_incore: no compute ops leak into Orchestration.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - x = pl.add(x, 1.0) - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 2.0) - return x - - # Run the full pipeline with verification enabled — should not throw - program = passes.unroll_loops()(Input) - program = passes.convert_to_ssa()(program) - program = passes.flatten_call_expr()(program) - program = passes.split_chunked_loops()(program) - program = passes.interchange_chunk_loops()(program) - program = passes.outline_incore_scopes()(program) - - # Verify no compute tensor ops in orchestration - for func in program.functions.values(): - if 
func.func_type == ir.FunctionType.Orchestration: - func_str = python_print(func) - assert "tensor.add" not in func_str - - -class TestOutlineNamedIncoreScopes: - """Test OutlineIncoreScopes pass with user-provided scope names.""" - - def test_outline_named_incore_scope(self): - """Test that user-provided name is used for the outlined function.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, name_hint="fused_add"): - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return y - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def fused_add(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return y - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - y: pl.Tensor[[64], pl.FP32] = self.fused_add(x) - return y - - Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) - After = passes.outline_incore_scopes()(Before) - ir.assert_structural_equal(After, Expected) - - def test_outline_mixed_named_and_unnamed_scopes(self): - """Test that unnamed scopes still get auto-generated names when mixed with named scopes.""" - - @pl.program - class Before: - @pl.function - def main( - self, - x: pl.Tensor[[64], pl.FP32], - y: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, name_hint="first_kernel"): - a: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - with pl.at(level=pl.Level.CORE_GROUP): - b: pl.Tensor[[64], pl.FP32] = pl.add(y, a) - return b - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def first_kernel(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return a - - @pl.function(type=pl.FunctionType.InCore) - def main_incore_1( - self, - 
y: pl.Tensor[[64], pl.FP32], - a: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - b: pl.Tensor[[64], pl.FP32] = pl.add(y, a) - return b - - @pl.function(type=pl.FunctionType.Orchestration) - def main( - self, - x: pl.Tensor[[64], pl.FP32], - y: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = self.first_kernel(x) - b: pl.Tensor[[64], pl.FP32] = self.main_incore_1(y, a) - return b + z: pl.Tensor[[64], pl.FP32] = pl.mul(y, y) + return z Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) After = passes.outline_incore_scopes()(Before) - ir.assert_structural_equal(After, Expected) - def test_outline_duplicate_name_hint_auto_dedup(self): - """Test that duplicate name_hints are auto-deduplicated.""" - - @pl.program - class Before: - @pl.function - def main( - self, - x: pl.Tensor[[64], pl.FP32], - y: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, name_hint="my_kernel"): - a: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - with pl.at(level=pl.Level.CORE_GROUP, name_hint="my_kernel"): - b: pl.Tensor[[64], pl.FP32] = pl.add(y, a) - return b - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def my_kernel(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = pl.add(x, x) - return a - - @pl.function(type=pl.FunctionType.InCore) - def my_kernel_0( - self, - y: pl.Tensor[[64], pl.FP32], - a: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - b: pl.Tensor[[64], pl.FP32] = pl.add(y, a) - return b - - @pl.function(type=pl.FunctionType.Orchestration) - def main( - self, - x: pl.Tensor[[64], pl.FP32], - y: pl.Tensor[[64], pl.FP32], - ) -> pl.Tensor[[64], pl.FP32]: - a: pl.Tensor[[64], pl.FP32] = self.my_kernel(x) - b: pl.Tensor[[64], pl.FP32] = self.my_kernel_0(y, a) - return b - - Before = passes.convert_to_ssa()(Before) - Expected = passes.convert_to_ssa()(Expected) 
- After = passes.outline_incore_scopes()(Before) - ir.assert_structural_equal(After, Expected) + func_types = {gv.name: func.func_type for gv, func in After.functions.items()} + assert func_types["main"] == ir.FunctionType.Orchestration + incore_count = sum(1 for t in func_types.values() if t == ir.FunctionType.InCore) + assert incore_count == 2 if __name__ == "__main__": diff --git a/tests/ut/ir/transforms/test_pass_manager.py b/tests/ut/ir/transforms/test_pass_manager.py index 4065083b0..a04af594d 100644 --- a/tests/ut/ir/transforms/test_pass_manager.py +++ b/tests/ut/ir/transforms/test_pass_manager.py @@ -17,8 +17,6 @@ from pypto.backend import BackendType TENSOR_ONLY_PASSES = [ - "SplitChunkedLoops", - "InterchangeChunkLoops", "OutlineHierarchyScopes", "OutlineIncoreScopes", "OutlineClusterScopes", diff --git a/tests/ut/ir/transforms/test_pass_pipeline.py b/tests/ut/ir/transforms/test_pass_pipeline.py index cd73b407c..da7c23572 100644 --- a/tests/ut/ir/transforms/test_pass_pipeline.py +++ b/tests/ut/ir/transforms/test_pass_pipeline.py @@ -161,22 +161,22 @@ def test_before_mode_catches_false_ssa_claim(self): # Same Var assigned twice — genuine SSA violation program = _make_ssa_violating_program() with pytest.raises(Exception, match="Pre-verification failed"): - passes.outline_incore_scopes()(program) + passes.outline_hierarchy_scopes()(program) def test_before_mode_succeeds_when_property_holds(self): """BEFORE mode passes when the required property actually holds.""" with passes.PassContext([passes.VerificationInstrument(passes.VerificationMode.BEFORE)]): program = _make_non_ssa_program() program = passes.convert_to_ssa()(program) - result = passes.outline_incore_scopes()(program) + result = passes.outline_hierarchy_scopes()(program) assert result is not None def test_empty_context_disables_verification(self): """Empty instrument list overrides conftest's verification context.""" with passes.PassContext([]): - # OutlineIncoreScopes requires SSAForm, but empty 
context = no check + # OutlineHierarchyScopes requires SSAForm, but empty context = no check program = _make_non_ssa_program() - result = passes.outline_incore_scopes()(program) + result = passes.outline_hierarchy_scopes()(program) assert result is not None def test_before_and_after_succeeds_on_valid_pipeline(self): @@ -193,7 +193,7 @@ def test_before_and_after_catches_pre_violation(self): # Same Var assigned twice — genuine SSA violation program = _make_ssa_violating_program() with pytest.raises(Exception, match="Pre-verification failed"): - passes.outline_incore_scopes()(program) + passes.outline_hierarchy_scopes()(program) def test_pipeline_with_context(self): """PassPipeline respects active PassContext instruments.""" diff --git a/tests/ut/ir/transforms/test_split_chunked_loops.py b/tests/ut/ir/transforms/test_split_chunked_loops.py deleted file mode 100644 index a4a0e9d9f..000000000 --- a/tests/ut/ir/transforms/test_split_chunked_loops.py +++ /dev/null @@ -1,1423 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
-# ----------------------------------------------------------------------------------------------------------- - -"""Unit tests for SplitChunkedLoops pass.""" - -import re -from typing import cast - -import pypto.language as pl -import pytest -from pypto import ir, passes -from pypto.ir.printer import python_print - - -def _prepare_for_split(program): - """Run prerequisite passes to produce SSA input for SplitChunkedLoops.""" - program = passes.unroll_loops()(program) - program = passes.convert_to_ssa()(program) - program = passes.flatten_call_expr()(program) - return program - - -def _top_level_stmts(program: ir.Program) -> list[ir.Stmt]: - """Return the first function's top-level statements.""" - func = list(program.functions.values())[0] - return list(cast(ir.SeqStmts, func.body).stmts) - - -def _body_stmts(stmt: ir.Stmt) -> list[ir.Stmt]: - """Return child statements from a SeqStmts body.""" - return list(cast(ir.SeqStmts, stmt).stmts) - - -def _normalize_expected(program): - """Normalize Expected IR structure to match pass pipeline output. - - The DSL-constructed Expected programs have a different statement nesting - than the pass pipeline output. This applies the same structural - normalization so assert_structural_equal can compare them. 
- """ - return passes.normalize_stmt_structure()(program) - - -class TestBasicChunking: - """Tests for basic loop chunking with SSA iter_args propagation.""" - - def test_divisible_chunk(self): - """Chunk a loop where trip_count is divisible by chunk_size.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 10, 1, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_0_out, (x_iter_1_outer,) in pl.range( - 0, 2, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_0_in, (x_iter_1_inner,) in pl.range( - 0, - 5, - 1, - init_values=(x_iter_1_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_iter_1_inner, 1.0) - x_iter_1_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - x_iter_1_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter_1_inner_rv) - return x_iter_1_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_non_divisible_chunk(self): - """Chunk a loop where trip_count is NOT divisible by chunk_size.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 7, 1, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - @pl.program - class Expected: - 
@pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_0_out, (x_iter_1_outer,) in pl.range( - 0, 1, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): # noqa: E501 - for i_0_in, (x_iter_1_inner,) in pl.range( - 0, - 5, - 1, - init_values=(x_iter_1_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): # noqa: E501 - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_iter_1_inner, 1.0) - x_iter_1_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - x_iter_1_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter_1_inner_rv) - for i_0_rem, (x_iter_1_rem,) in pl.range( - 0, - 2, - 1, - init_values=(x_iter_1_outer_rv,), - attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder}, - ): # noqa: E501 - x_3_f: pl.Tensor[[64], pl.FP32] = pl.add(x_iter_1_rem, 1.0) - x_iter_1_rem_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3_f) - return x_iter_1_rem_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_single_chunk(self): - """Chunk a loop where trip_count equals chunk_size.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 5, 1, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_0_out, (x_iter_1_outer,) in pl.range( - 0, 1, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_0_in, (x_iter_1_inner,) in pl.range( - 0, - 5, - 1, - 
init_values=(x_iter_1_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_iter_1_inner, 1.0) - x_iter_1_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - x_iter_1_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter_1_inner_rv) - return x_iter_1_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - -class TestChunkingWithStep: - """Tests for chunking with non-unit step.""" - - def test_step_2(self): - """Chunk with step=2: range(0, 20, 2, chunk=5) -> trip_count=10.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 20, 2, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_0_out, (x_iter_1_outer,) in pl.range( - 0, 2, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_0_in, (x_iter_1_inner,) in pl.range( - 0, - 5, - 1, - init_values=(x_iter_1_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_iter_1_inner, 1.0) - x_iter_1_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - x_iter_1_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter_1_inner_rv) - return x_iter_1_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_chunk_all_remainder(self): - """Chunk where trip_count < chunk_size -> only remainder loop.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with 
pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 3, 1, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_0_rem, (x_iter_1_rem,) in pl.range( - 0, 3, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder} - ): - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_iter_1_rem, 1.0) - x_iter_1_rem_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - return x_iter_1_rem_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - -class TestChunkingWithKind: - """Tests for chunking with different loop kinds.""" - - def test_parallel_chunk(self): - """Chunk a parallel loop: both inner and outer loops should be Parallel.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_0_out, (x_iter_1_outer,) in pl.parallel( - 0, 2, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_0_in, (x_iter_1_inner,) in pl.parallel( - 0, - 4, - 1, - init_values=(x_iter_1_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_iter_1_inner, 1.0) - 
x_iter_1_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - x_iter_1_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter_1_inner_rv) - return x_iter_1_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - @pytest.mark.filterwarnings("ignore:.*RoundtripInstrument.*IR not printable:UserWarning") - def test_unroll_chunk(self): - """Chunk an unroll loop: both inner and outer loops are Unroll. - - Since the DSL does not support pl.unroll() with init_values, - we verify the IR structure properties directly instead of - using structural equality. - """ - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.unroll(0, 12, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - # Extract the function body - stmts = _top_level_stmts(After) - - # Body should be SeqStmts: [auto_incore_scope, return] - assert len(stmts) == 2 # auto_incore scope + return - - # The first stmt is the AutoInCore scope - scope = cast(ir.ScopeStmt, stmts[0]) - assert scope.scope_kind == ir.ScopeKind.AutoInCore - - # Inside the scope is the outer for loop - outer_for = cast(ir.ForStmt, scope.body) - assert outer_for.kind == ir.ForKind.Unroll - assert len(outer_for.iter_args) == 1 - assert len(outer_for.return_vars) == 1 - - # Outer loop bounds: range(0, 3, 1) — 12/4 = 3 full chunks - assert cast(ir.ConstInt, outer_for.start).value == 0 - assert cast(ir.ConstInt, outer_for.stop).value == 3 - - # Inner loop is inside outer body (SeqStmts: [inner_for, yield]) - outer_body_stmts = _body_stmts(outer_for.body) - inner_for = cast(ir.ForStmt, outer_body_stmts[0]) - assert inner_for.kind == ir.ForKind.Unroll - assert len(inner_for.iter_args) == 1 - assert len(inner_for.return_vars) == 1 - - # Inner loop bounds: 
range(0, 4, 1) - assert cast(ir.ConstInt, inner_for.start).value == 0 - assert cast(ir.ConstInt, inner_for.stop).value == 4 - - -class TestPrinterRoundTrip: - """Tests for printer output with chunk kwargs.""" - - def test_chunk_in_printer(self): - """Verify that chunk kwarg is printed correctly.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 10, 1, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - printed = python_print(Before) - assert "chunk=5" in printed - - def test_parallel_chunk_in_printer(self): - """Verify parallel chunk kwarg is printed.""" - - @pl.program - class Before: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, 8, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - printed = python_print(Before) - assert "chunk=4" in printed - assert "pl.parallel" in printed - - -class TestParserErrors: - """Tests for parser validation of chunk arguments.""" - - def test_chunk_with_init_values_allowed(self): - """chunk + init_values should be allowed (not raise parser error).""" - - @pl.program - class Good: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i, (s,) in pl.range(10, init_values=(x,), chunk=5, chunk_policy="leading_full"): - s = pl.add(s, 1.0) - s = pl.yield_(s) - return x - - def test_chunk_zero_error(self): - """chunk=0 should raise parser error.""" - with pytest.raises(Exception, match="chunk must be a positive integer"): - - @pl.program - class Bad: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with 
pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 10, 1, chunk=0, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - def test_chunk_negative_error(self): - """chunk=-1 should raise parser error.""" - with pytest.raises(Exception, match="chunk must be a positive integer"): - - @pl.program - class Bad: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 10, 1, chunk=-1): - x = pl.add(x, 1.0) - return x - - -class TestLoopOrigin: - """Tests for LoopOrigin annotation set by SplitChunkedLoops.""" - - def _get_func_body_stmts(self, program): - """Get the top-level statements from the first function's body.""" - return _top_level_stmts(program) - - def _get_auto_incore_body_stmts(self, program): - """Get statements inside the AutoInCore scope.""" - stmts = self._get_func_body_stmts(program) - # First stmt should be AutoInCore scope - scope = cast(ir.ScopeStmt, stmts[0]) - assert scope.scope_kind == ir.ScopeKind.AutoInCore - body = scope.body - # Body may be a single stmt or SeqStmts - if hasattr(body, "stmts"): - return _body_stmts(body) - return [body] - - def test_divisible_chunk_origin(self): - """Verify outer=ChunkOuter, inner=ChunkInner for divisible chunks.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 10, 1, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - inner_stmts = self._get_auto_incore_body_stmts(After) - outer_for = cast(ir.ForStmt, inner_stmts[0]) - - assert outer_for.attrs.get("loop_origin") == ir.LoopOrigin.ChunkOuter - - # Inner loop is inside outer body - 
outer_body_stmts = _body_stmts(outer_for.body) - inner_for = cast(ir.ForStmt, outer_body_stmts[0]) - assert inner_for.attrs.get("loop_origin") == ir.LoopOrigin.ChunkInner - - def test_non_divisible_chunk_origin(self): - """Verify outer=ChunkOuter, inner=ChunkInner, remainder=ChunkRemainder.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 7, 1, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - inner_stmts = self._get_auto_incore_body_stmts(After) - # stmts: [outer_for, remainder_for] - outer_for = cast(ir.ForStmt, inner_stmts[0]) - remainder_for = cast(ir.ForStmt, inner_stmts[1]) - - assert outer_for.attrs.get("loop_origin") == ir.LoopOrigin.ChunkOuter - - outer_body_stmts = _body_stmts(outer_for.body) - inner_for = cast(ir.ForStmt, outer_body_stmts[0]) - assert inner_for.attrs.get("loop_origin") == ir.LoopOrigin.ChunkInner - - assert remainder_for.attrs.get("loop_origin") == ir.LoopOrigin.ChunkRemainder - - def test_all_remainder_origin(self): - """Verify remainder=ChunkRemainder when trip_count < chunk_size.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 3, 1, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - inner_stmts = self._get_auto_incore_body_stmts(After) - remainder_for = cast(ir.ForStmt, inner_stmts[0]) - assert remainder_for.attrs.get("loop_origin") == ir.LoopOrigin.ChunkRemainder - - def test_non_chunked_loop_origin(self): - """Verify regular (non-chunked) loops have Original 
origin.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - for i in pl.range(0, 10, 1): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - stmts = self._get_func_body_stmts(After) - for_stmt = cast(ir.ForStmt, stmts[0]) - assert "loop_origin" not in for_stmt.attrs - - -class TestNestedChunking: - """Tests for nested chunked loops with iter_args propagation.""" - - def test_nested_outer_divisible_inner_remainder(self): - """Nested chunks: outer divisible, inner only remainder. - - Reproduces the bug where inner remainder loop's init_values - referenced the original (unsplit) iter_arg instead of the - inner iter_arg from the outer loop's split. - """ - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(8, chunk=4, chunk_policy="leading_full"): - for j in pl.parallel(1, chunk=2, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_0_out, (x_iter_1_outer,) in pl.parallel( - 0, 2, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_0_in, (x_iter_1_inner,) in pl.parallel( - 0, - 4, - 1, - init_values=(x_iter_1_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - for j_0_rem, (x_iter_3_rem,) in pl.parallel( - 0, - 1, - 1, - init_values=(x_iter_1_inner,), - attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder}, - ): - x_5: pl.Tensor[[64], pl.FP32] = pl.tensor.add(x_iter_3_rem, 1.0) - 
x_iter_3_rem_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_5) - x_iter_1_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter_3_rem_rv) - x_iter_1_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter_1_inner_rv) - return x_iter_1_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_nested_both_divisible(self): - """Nested chunks: both outer and inner divisible.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(8, chunk=4, chunk_policy="leading_full"): - for j in pl.parallel(12, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_0_out, (x_iter_1_outer,) in pl.parallel( - 0, 2, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_0_in, (x_iter_1_inner,) in pl.parallel( - 0, - 4, - 1, - init_values=(x_iter_1_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - for j_0_out, (x_iter_3_outer,) in pl.parallel( - 0, - 3, - 1, - init_values=(x_iter_1_inner,), - attrs={"loop_origin": pl.LoopOrigin.ChunkOuter}, - ): - for j_0_in, (x_iter_3_inner,) in pl.parallel( - 0, - 4, - 1, - init_values=(x_iter_3_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - x_5: pl.Tensor[[64], pl.FP32] = pl.tensor.add(x_iter_3_inner, 1.0) - x_iter_3_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_5) - x_iter_3_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter_3_inner_rv) - x_iter_1_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter_3_outer_rv) - x_iter_1_outer_rv: pl.Tensor[[64], 
pl.FP32] = pl.yield_(x_iter_1_inner_rv) - return x_iter_1_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_nested_both_remainder(self): - """Nested chunks: both outer and inner have remainders. - - Verifies init_values are correctly substituted in all paths: - outer-inner, outer-remainder, remainder-inner, remainder-remainder. - """ - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(6, chunk=4, chunk_policy="leading_full"): - for j in pl.parallel(3, chunk=2, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - printed = python_print(After) - init_refs = re.findall(r"init_values=\((\w+),\)", printed) - for ref in init_refs: - assert ref != "x__iter_v1", ( - "Found bare 'x__iter_v1' in init_values; should be a chunk-qualified iter name." - ) - assert ref != "x__iter_v3", ( - "Found bare 'x__iter_v3' in init_values; should be a chunk-qualified iter name." 
- ) - - -class TestDynamicChunking: - """Tests for chunked loops where start/stop are dynamic (runtime) scalars.""" - - @staticmethod - def _split_and_simplify(program): - """Run prerequisite passes, split chunked loops, and simplify expressions.""" - prepared = _prepare_for_split(program) - split = passes.split_chunked_loops()(prepared) - return passes.simplify()(split) - - def test_dynamic_stop(self): - """Dynamic stop: outer+inner+remainder with FloorDiv/FloorMod bounds.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32], n: pl.Scalar[pl.INDEX]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, n, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main( - self, x_0: pl.Tensor[[64], pl.FP32], n_0: pl.Scalar[pl.INDEX] - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.range( - 0, n_0 // 4, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_in, (x_inner,) in pl.range( - 0, 4, 1, init_values=(x_outer,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - for i_rem, (x_rem,) in pl.range( - 0, - n_0 % 4, - 1, - init_values=(x_outer_rv,), - attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder}, - ): - x_4: pl.Tensor[[64], pl.FP32] = pl.add(x_rem, 1.0) - x_rem_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_4) - return x_rem_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_dynamic_start_and_stop(self): - """Both start and stop are dynamic.""" - - @pl.program - class Input: - 
@pl.function - def main( - self, - x: pl.Tensor[[64], pl.FP32], - lo: pl.Scalar[pl.INDEX], - hi: pl.Scalar[pl.INDEX], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(lo, hi, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main( - self, - x_0: pl.Tensor[[64], pl.FP32], - lo_0: pl.Scalar[pl.INDEX], - hi_0: pl.Scalar[pl.INDEX], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.range( - 0, - pl.max(hi_0 - lo_0, 0) // 4, - 1, - init_values=(x_0,), - attrs={"loop_origin": pl.LoopOrigin.ChunkOuter}, - ): - for i_in, (x_inner,) in pl.range( - 0, 4, 1, init_values=(x_outer,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - for i_rem, (x_rem,) in pl.range( - 0, - pl.max(hi_0 - lo_0, 0) % 4, - 1, - init_values=(x_outer_rv,), - attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder}, - ): - x_4: pl.Tensor[[64], pl.FP32] = pl.add(x_rem, 1.0) - x_rem_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_4) - return x_rem_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_dynamic_stop_parallel(self): - """Dynamic stop with pl.parallel should also work.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32], n: pl.Scalar[pl.INDEX]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.parallel(0, n, 1, chunk=4, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - 
@pl.function(strict_ssa=True) - def main( - self, x_0: pl.Tensor[[64], pl.FP32], n_0: pl.Scalar[pl.INDEX] - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.parallel( - 0, n_0 // 4, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_in, (x_inner,) in pl.parallel( - 0, 4, 1, init_values=(x_outer,), attrs={"loop_origin": pl.LoopOrigin.ChunkInner} - ): - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - for i_rem, (x_rem,) in pl.parallel( - 0, - n_0 % 4, - 1, - init_values=(x_outer_rv,), - attrs={"loop_origin": pl.LoopOrigin.ChunkRemainder}, - ): - x_4: pl.Tensor[[64], pl.FP32] = pl.add(x_rem, 1.0) - x_rem_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_4) - return x_rem_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_static_still_works(self): - """Regression: static bounds should continue to produce same IR as before.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i in pl.range(0, 10, 1, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_0_out, (x_iter_1_outer,) in pl.range( - 0, 2, 1, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_0_in, (x_iter_1_inner,) in pl.range( - 0, - 5, - 1, - init_values=(x_iter_1_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, 
- ): - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_iter_1_inner, 1.0) - x_iter_1_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - x_iter_1_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_iter_1_inner_rv) - return x_iter_1_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - -class TestGuardedPolicy: - """Tests for the `guarded` chunk policy. - - Guarded mode emits a single outer loop over ceil(T/C) chunks and an inner - loop of size C, with the body wrapped in `if idx < stop` so out-of-range - iterations become no-ops. With iter_args, the guard becomes an IfStmt phi - whose else branch passes the inner iter_args through unchanged. - """ - - @staticmethod - def _split_and_simplify(program): - """Prepare, split, then simplify so conditions compare cleanly.""" - prepared = _prepare_for_split(program) - split = passes.split_chunked_loops()(prepared) - return passes.simplify()(split) - - def test_guarded_is_default(self): - """Omitting chunk_policy selects Guarded.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(7, chunk=5): - x = pl.add(x, 1.0) - return x - - @pl.program - class InputExplicit: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(7, chunk=5, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - # Default and explicit "guarded" must produce identical IR. 
- After = passes.split_chunked_loops()(_prepare_for_split(Input)) - AfterExplicit = passes.split_chunked_loops()(_prepare_for_split(InputExplicit)) - ir.assert_structural_equal(After, AfterExplicit) - - def test_guarded_divisible_iter_args(self): - """Static bound, trip_count divisible by chunk_size, with iter_args.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(10, chunk=5, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.range( - 2, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_in, (x_inner,) in pl.range( - 5, - init_values=(x_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - if i_out * 5 + i_in < 10: - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - else: - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_if) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - return x_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_non_divisible_iter_args(self): - """Static bound, trip_count NOT divisible by chunk_size: ceil(7/5)=2 outer chunks. - - The guard `idx < 7` disables lanes 7..9 in the second outer chunk, - and the else branch threads the inner iter_args through unchanged. 
- """ - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(7, chunk=5, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.range( - 2, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_in, (x_inner,) in pl.range( - 5, - init_values=(x_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - if i_out * 5 + i_in < 7: - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - else: - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_if) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - return x_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_trip_less_than_chunk(self): - """trip_count < chunk_size: ceil(3/5)=1 outer chunk, inner guard masks lanes >= 3.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(3, chunk=5, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.range( - 1, init_values=(x_0,), attrs={"loop_origin": 
pl.LoopOrigin.ChunkOuter} - ): - for i_in, (x_inner,) in pl.range( - 5, - init_values=(x_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - # Simplify proves i_out is always 0 (outer range [0,1)). - if i_in < 3: - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - else: - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_if) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - return x_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_no_iter_args(self): - """No iter_args: IfStmt has no phi and no else branch — body runs or is skipped.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(7, chunk=5, chunk_policy="guarded"): - _tmp = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out in pl.range(2, attrs={"loop_origin": pl.LoopOrigin.ChunkOuter}): - for i_in in pl.range(5, attrs={"loop_origin": pl.LoopOrigin.ChunkInner}): - if i_out * 5 + i_in < 7: - _tmp: pl.Tensor[[64], pl.FP32] = pl.add(x_0, 1.0) - return x_0 - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_with_step(self): - """Non-unit step: guard compares `idx * step < stop`, idx = (out*C + in).""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(0, 20, 2, chunk=5, chunk_policy="guarded"): - x = 
pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.range( - 2, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_in, (x_inner,) in pl.range( - 5, - init_values=(x_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - if (i_out * 5 + i_in) * 2 < 20: - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - else: - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_if) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - return x_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_parallel(self): - """pl.parallel: both outer and inner guarded loops are Parallel kind.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.parallel(8, chunk=4, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.parallel( - 2, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_in, (x_inner,) in pl.parallel( - 4, - init_values=(x_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - if i_out * 4 + i_in < 8: - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - 
else: - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_if) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - return x_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_dynamic_stop(self): - """Dynamic stop `n`: outer count = ceil(n/4) = (n + 3) // 4.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32], n: pl.Scalar[pl.INDEX]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(n, chunk=4, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main( - self, x_0: pl.Tensor[[64], pl.FP32], n_0: pl.Scalar[pl.INDEX] - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.range( - (n_0 + 3) // 4, - init_values=(x_0,), - attrs={"loop_origin": pl.LoopOrigin.ChunkOuter}, - ): - for i_in, (x_inner,) in pl.range( - 4, - init_values=(x_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - if i_out * 4 + i_in < n_0: - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - else: - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_if) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - return x_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_dynamic_start_and_stop(self): - """Dynamic start AND stop: outer count = ceil(max(hi-lo, 0) / 4).""" - - @pl.program - class Input: - @pl.function - def main( - self, - x: pl.Tensor[[64], pl.FP32], - lo: pl.Scalar[pl.INDEX], - hi: pl.Scalar[pl.INDEX], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, 
optimization=pl.chunked_loop_optimizer): - for _i in pl.range(lo, hi, 1, chunk=4, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main( - self, - x_0: pl.Tensor[[64], pl.FP32], - lo_0: pl.Scalar[pl.INDEX], - hi_0: pl.Scalar[pl.INDEX], - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.range( - (pl.max(hi_0 - lo_0, 0) + 3) // 4, - init_values=(x_0,), - attrs={"loop_origin": pl.LoopOrigin.ChunkOuter}, - ): - for i_in, (x_inner,) in pl.range( - 4, - init_values=(x_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - if lo_0 + (i_out * 4 + i_in) < hi_0: - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - else: - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_if) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - return x_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_dynamic_no_iter_args(self): - """Dynamic bound with no iter_args: IfStmt has no phi.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32], n: pl.Scalar[pl.INDEX]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(n, chunk=4, chunk_policy="guarded"): - _tmp = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main( - self, x_0: pl.Tensor[[64], pl.FP32], n_0: pl.Scalar[pl.INDEX] - ) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out in pl.range((n_0 + 3) // 4, attrs={"loop_origin": pl.LoopOrigin.ChunkOuter}): - for i_in in 
pl.range(4, attrs={"loop_origin": pl.LoopOrigin.ChunkInner}): - if i_out * 4 + i_in < n_0: - _tmp: pl.Tensor[[64], pl.FP32] = pl.add(x_0, 1.0) - return x_0 - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_nested(self): - """Nested guarded loops: inner guarded loop lives inside outer's then-branch. - - Verifies iter_args thread correctly through both levels of IfStmt phi. - """ - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.parallel(8, chunk=4, chunk_policy="guarded"): - for _j in pl.parallel(3, chunk=2, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.parallel( - 2, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_in, (x_inner,) in pl.parallel( - 4, - init_values=(x_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - if i_out * 4 + i_in < 8: - for j_out, (x_j_outer,) in pl.parallel( - 2, - init_values=(x_inner,), - attrs={"loop_origin": pl.LoopOrigin.ChunkOuter}, - ): - for j_in, (x_j_inner,) in pl.parallel( - 2, - init_values=(x_j_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - if j_out * 2 + j_in < 3: - x_5: pl.Tensor[[64], pl.FP32] = pl.add(x_j_inner, 1.0) - x_j_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_5) - else: - x_j_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_j_inner) - x_j_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_j_if) - x_j_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_j_inner_rv) - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_j_outer_rv) - else: - x_if: pl.Tensor[[64], 
pl.FP32] = pl.yield_(x_inner) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_if) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - return x_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_negative_step(self): - """Descending chunked range: guard uses `idx > stop` since step < 0. - - Regression test: the initial implementation built the guard as `idx < stop` - unconditionally, which made every iteration of a descending loop a no-op. - """ - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(10, 0, -1, chunk=4, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out, (x_outer,) in pl.range( - 3, init_values=(x_0,), attrs={"loop_origin": pl.LoopOrigin.ChunkOuter} - ): - for i_in, (x_inner,) in pl.range( - 4, - init_values=(x_outer,), - attrs={"loop_origin": pl.LoopOrigin.ChunkInner}, - ): - # Original guard: 10 + (i_out*4 + i_in) * -1 > 0 - # Simplify rearranges stop to the left-hand side. 
- if -10 < (i_out * 4 + i_in) * -1: - x_3: pl.Tensor[[64], pl.FP32] = pl.add(x_inner, 1.0) - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_3) - else: - x_if: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner) - x_inner_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_if) - x_outer_rv: pl.Tensor[[64], pl.FP32] = pl.yield_(x_inner_rv) - return x_outer_rv - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_negative_step_no_iter_args(self): - """Descending chunked range without iter_args: guard still uses `idx > stop`.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(10, 0, -1, chunk=4, chunk_policy="guarded"): - _tmp = pl.add(x, 1.0) - return x - - After = self._split_and_simplify(Input) - - @pl.program - class Expected: - @pl.function(strict_ssa=True) - def main(self, x_0: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for i_out in pl.range(3, attrs={"loop_origin": pl.LoopOrigin.ChunkOuter}): - for i_in in pl.range(4, attrs={"loop_origin": pl.LoopOrigin.ChunkInner}): - if -10 < (i_out * 4 + i_in) * -1: - _tmp: pl.Tensor[[64], pl.FP32] = pl.add(x_0, 1.0) - return x_0 - - ir.assert_structural_equal(After, _normalize_expected(Expected)) - - def test_guarded_origin_attrs(self): - """Guarded mode sets ChunkOuter/ChunkInner attrs and never emits ChunkRemainder.""" - - @pl.program - class Input: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(7, chunk=5, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - Before = _prepare_for_split(Input) - After = passes.split_chunked_loops()(Before) - - # Navigate: [ScopeStmt, return]; 
ScopeStmt.body = outer_for (guarded is a single for) - stmts = _top_level_stmts(After) - scope = cast(ir.ScopeStmt, stmts[0]) - outer_for = cast(ir.ForStmt, scope.body) - assert outer_for.attrs.get("loop_origin") == ir.LoopOrigin.ChunkOuter - - inner_for = cast(ir.ForStmt, _body_stmts(outer_for.body)[0]) - assert inner_for.attrs.get("loop_origin") == ir.LoopOrigin.ChunkInner - - # No remainder loop should exist. - printed = python_print(After) - assert "ChunkRemainder" not in printed - - def test_guarded_printer_omits_default(self): - """Printer omits `chunk_policy="guarded"` (it's the default) but prints `leading_full`.""" - - @pl.program - class Guarded: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(10, chunk=5, chunk_policy="guarded"): - x = pl.add(x, 1.0) - return x - - @pl.program - class LeadingFull: - @pl.function - def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer): - for _i in pl.range(10, chunk=5, chunk_policy="leading_full"): - x = pl.add(x, 1.0) - return x - - guarded_printed = python_print(Guarded) - leading_printed = python_print(LeadingFull) - assert "chunk_policy" not in guarded_printed - assert 'chunk_policy="leading_full"' in leading_printed - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/tests/ut/ir/transforms/test_transform_utils.py b/tests/ut/ir/transforms/test_transform_utils.py index 0d5346de4..8fbbea739 100644 --- a/tests/ut/ir/transforms/test_transform_utils.py +++ b/tests/ut/ir/transforms/test_transform_utils.py @@ -152,7 +152,7 @@ def test_scope_stmt(self): """Collects vars from ScopeStmt body.""" v1, s1 = _assign("s", _const(7)) body = ir.SeqStmts([s1], _span()) - scope = ir.InCoreScopeStmt(body=body, span=_span()) + scope = 
ir.HierarchyScopeStmt(level=ir.Level.CORE_GROUP, body=body, span=_span()) result = ir.collect_def_vars(scope) assert len(result) == 1 assert result[0] is v1 diff --git a/tests/ut/language/parser/test_error_cases.py b/tests/ut/language/parser/test_error_cases.py index 4456d93c9..f09e76197 100644 --- a/tests/ut/language/parser/test_error_cases.py +++ b/tests/ut/language/parser/test_error_cases.py @@ -122,29 +122,6 @@ def bad_target(n: pl.Tensor[[1], pl.INT32]) -> pl.Tensor[[1], pl.INT32]: return result - def test_chunked_loop_requires_auto_incore(self): - """Test that chunked loops are rejected outside auto_incore scope.""" - code = """ -import pypto.language as pl - -@pl.program -class ChunkedLoopProgram: - @pl.function(type=pl.FunctionType.Orchestration) - def main( - self, - x: pl.Tensor[[16, 4], pl.FP32], - seq_lens: pl.Tensor[[16], pl.INT32], - ) -> pl.Tensor[[16, 4], pl.FP32]: - for b in pl.parallel(0, 16, 1, chunk=4, chunk_policy="leading_full"): - _ctx_len = pl.tensor.read(seq_lens, [b]) - return x -""" - with pytest.raises( - ParserSyntaxError, - match=r"chunk=\.\.\. 
loops are only valid inside with pl\.at", - ): - pl.parse_program(code) - def test_unknown_tensor_operation(self): """Test error on unknown tensor operation.""" diff --git a/tests/ut/language/parser/test_scope_parsing.py b/tests/ut/language/parser/test_scope_parsing.py index 4ff230b80..80e6ce291 100644 --- a/tests/ut/language/parser/test_scope_parsing.py +++ b/tests/ut/language/parser/test_scope_parsing.py @@ -133,7 +133,7 @@ def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: scope_stmt = body assert isinstance(scope_stmt, ir.ScopeStmt) assert scope_stmt.name_hint == "my_kernel" - assert scope_stmt.scope_kind == ir.ScopeKind.InCore + assert scope_stmt.scope_kind == ir.ScopeKind.Hierarchy def test_parse_unnamed_scope_has_empty_name(self): """Test that unnamed scopes have empty name.""" diff --git a/tests/ut/language/test_range_unroll_kwarg.py b/tests/ut/language/test_range_unroll_kwarg.py index 79adedc61..dcf1cbf13 100644 --- a/tests/ut/language/test_range_unroll_kwarg.py +++ b/tests/ut/language/test_range_unroll_kwarg.py @@ -113,7 +113,7 @@ def test_unroll_with_chunk_rejected(self): class _P: @pl.function def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]: - with pl.at(level=pl.Level.CORE_GROUP, optimizations=[pl.auto_chunk]): + with pl.at(level=pl.Level.CORE_GROUP): for i in pl.range(8, chunk=4, unroll=2): x = pl.add(x, 1.0) return x