diff --git a/.claude/skills/testing/SKILL.md b/.claude/skills/testing/SKILL.md
index 5df96cfc2..dc3c078d5 100644
--- a/.claude/skills/testing/SKILL.md
+++ b/.claude/skills/testing/SKILL.md
@@ -7,7 +7,7 @@ description: Testing guide and pre-commit testing strategy for PTO Runtime. Use
 ## Test Types
 
-1. **Python unit tests (ut-py)** (`tests/ut/`): Standard pytest tests for the Python compilation pipeline and nanobind bindings. Run with `pytest tests/ut`. Tests declaring `@pytest.mark.requires_hardware[("")]` auto-skip unless `--platform` points to a matching device.
+1. **Python unit tests (ut-py)** (`tests/ut/py/`): Standard pytest tests for the Python compilation pipeline and nanobind bindings. Run with `pytest tests/ut/py`. Tests declaring `@pytest.mark.requires_hardware` (optionally platform-specific, e.g. `requires_hardware("a2a3")`) auto-skip unless `--platform` points to a matching device.
 2. **C++ unit tests (ut-cpp)** (`tests/ut/cpp/`): GoogleTest-based tests for pure C++ modules. Run with `cmake -B tests/ut/cpp/build -S tests/ut/cpp && cmake --build tests/ut/cpp/build && ctest --test-dir tests/ut/cpp/build -LE requires_hardware --output-on-failure`. Hardware-required tests carry a `requires_hardware` or `requires_hardware_` ctest label and are filtered via `-LE`.
 3. **Scene tests** (`examples/{arch}/*/`, `tests/st/{arch}/*/`): End-to-end `@scene_test` classes declared inside `test_*.py`. Sim variants run cross-platform (Linux/macOS); hardware variants require the CANN toolkit and an Ascend device. Discovery is by pytest (batch) or `python test_*.py` (standalone); `#591`'s parallel orchestrator handles device bin-packing and ChipWorker reuse automatically.
diff --git a/docs/ci.md b/docs/ci.md
index bed71707c..d2c159c38 100644
--- a/docs/ci.md
+++ b/docs/ci.md
@@ -112,7 +112,7 @@ Three hardware tiers, applied to all test categories. See [testing.md](testing.m
 ## Test Sources
 
-### `tests/ut/` — Python unit tests (ut-py)
+### `tests/ut/py/` — Python unit tests (ut-py)
 Python unit tests.
Run via pytest, filtered by `--platform` + `requires_hardware` marker.
diff --git a/docs/testing.md b/docs/testing.md
index 53edf17fd..0c45efa64 100644
--- a/docs/testing.md
+++ b/docs/testing.md
@@ -60,12 +60,12 @@ Three test categories:
 | Category | Abbrev | Location | Runner | Description |
 | -------- | ------ | -------- | ------ | ----------- |
 | System tests | st | `examples/`, `tests/st/` | pytest (`@scene_test`) or standalone `python test_*.py` | Full end-to-end cases (compile + run + validate) |
-| Python unit tests | ut-py | `tests/ut/` | pytest | Unit tests for nanobind-exposed and Python modules |
+| Python unit tests | ut-py | `tests/ut/py/` | pytest | Unit tests for nanobind-exposed and Python modules |
 | C++ unit tests | ut-cpp | `tests/ut/cpp/` | ctest (GoogleTest) | Unit tests for pure C++ modules |
 
 ### Choosing ut-py vs ut-cpp
 
-If a module is exposed via nanobind (used by both C++ and Python), test in **ut-py** (`tests/ut/`).
+If a module is exposed via nanobind (used by both C++ and Python), test in **ut-py** (`tests/ut/py/`).
 If a module is pure C++ with no Python binding, test in **ut-cpp** (`tests/ut/cpp/`).
 
 ## Scene Test CLI Options
@@ -384,9 +384,18 @@ conftest.py # Root: --platform/--device options, ST fixtures
 ### C++ Unit Tests (`tests/ut/cpp/`)
 
-GoogleTest-based tests for shared components (`src/common/task_interface/` and `src/{arch}/runtime/common/`):
+See [ut-test-suite.md](ut-test-suite.md) for the full per-file coverage
+reference (371+ test cases across 53 files).
-- `test_data_type.cpp` — DataType enum, get_element_size(), get_dtype_name() +GoogleTest-based tests organized by component: + +| Subdirectory | Component under test | +| ------------ | -------------------- | +| `pto2_a2a3/` | PTO2 a2a3 on-chip runtime (`src/a2a3/runtime/tensormap_and_ringbuffer/`) | +| `pto2_a5/` | PTO2 a5 on-chip runtime (`src/a5/runtime/tensormap_and_ringbuffer/`) | +| `hierarchical/` | Host-side hierarchical runtime (`src/common/hierarchical/`) | +| `types/` | Cross-cutting types (`src/common/task_interface/`, `pto_types.h`) | +| `hardware/` | Tests requiring Ascend hardware (`comm_hccl`, etc.) | ```bash cmake -B tests/ut/cpp/build -S tests/ut/cpp @@ -394,7 +403,7 @@ cmake --build tests/ut/cpp/build ctest --test-dir tests/ut/cpp/build --output-on-failure ``` -### Python Unit Tests (`tests/ut/`) +### Python Unit Tests (`tests/ut/py/`) Tests for the nanobind extension and the Python build pipeline: @@ -403,10 +412,10 @@ Tests for the nanobind extension and the Python build pipeline: ```bash # No-hardware runner (hw tests auto-skip, no-hw tests run) -pytest tests/ut +pytest tests/ut/py # a2a3 hardware runner (no-hw tests skip, hw + a2a3-specific tests run) -pytest tests/ut --platform a2a3 +pytest tests/ut/py --platform a2a3 ``` ### Examples (`examples/{arch}/`) @@ -434,21 +443,26 @@ Hardware-only scene tests for large-scale and feature-rich scenarios that are to ### New C++ Unit Test -Add a new test file to `tests/ut/cpp/` and register it in `tests/ut/cpp/CMakeLists.txt`: +Add a new test file to the appropriate subdirectory under `tests/ut/cpp/` and register it in `tests/ut/cpp/CMakeLists.txt`: + +| Component | Subdirectory | Helper function | +| --------- | ------------ | --------------- | +| PTO2 a2a3 header-only | `pto2_a2a3/` | `add_a2a3_pto2_test` | +| PTO2 a2a3 runtime-linked | `pto2_a2a3/` | `add_a2a3_pto2_runtime_test` | +| PTO2 a5 | `pto2_a5/` | `add_a5_pto2_test` | +| Hierarchical host runtime | `hierarchical/` | `add_hierarchical_test` 
| +| Task interface types | `types/` | `add_task_interface_test` | +| Hardware (CANN) | `hardware/` | `add_comm_api_test` | ```cmake -add_executable(test_my_component - test_my_component.cpp - test_stubs.cpp +# Example: header-only PTO2 a2a3 test +add_a2a3_pto2_test(test_my_component pto2_a2a3/test_my_component.cpp) + +# Example: runtime-linked PTO2 a2a3 test +add_a2a3_pto2_runtime_test(test_my_component + SOURCES pto2_a2a3/test_my_component.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} ) -target_include_directories(test_my_component PRIVATE ${COMMON_DIR} ${TMR_RUNTIME_DIR} ${PLATFORM_INCLUDE_DIR}) -target_link_libraries(test_my_component gtest_main) -add_test(NAME test_my_component COMMAND test_my_component) - -# If hardware required: -# set_tests_properties(test_my_component PROPERTIES LABELS "requires_hardware") -# If specific platform required: -# set_tests_properties(test_my_component PROPERTIES LABELS "requires_hardware_a2a3") ``` #### C++ hardware tests needing NPU devices diff --git a/docs/ut-test-suite.md b/docs/ut-test-suite.md new file mode 100644 index 000000000..c1c66d208 --- /dev/null +++ b/docs/ut-test-suite.md @@ -0,0 +1,372 @@ +# Unit Test Suite Reference + +Comprehensive reference for the unit tests under `tests/ut/`. For build +commands, hardware classification, and CI integration see +[testing.md](testing.md) and [ci.md](ci.md). 
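Before the layout below, the hardware-classification rule that testing.md and ci.md apply to ut-py can be condensed into one decision function. This is a minimal sketch of the documented runner behavior only (unmarked tests run on the no-hardware runner and skip under `--platform`; marked tests skip without a matching platform); `should_run` and its arguments are invented for illustration, not repo code:

```python
from typing import Optional

# Hypothetical helper condensing the documented marker gating:
#   - unmarked tests run on the no-hardware runner, skip under --platform
#   - @pytest.mark.requires_hardware tests auto-skip without --platform
#   - a platform-specific marker must match --platform ("" means any device)
def should_run(required: Optional[str], cli_platform: Optional[str]) -> bool:
    if required is None:                  # no hardware marker
        return cli_platform is None       # only the no-hardware runner
    if cli_platform is None:              # hardware test, no device selected
        return False                      # auto-skip
    return required in ("", cli_platform)

# Mirrors `pytest tests/ut/py` vs `pytest tests/ut/py --platform a2a3`
assert should_run(None, None) and not should_run(None, "a2a3")
assert should_run("a2a3", "a2a3") and not should_run("a2a3", None)
```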
+ +## Directory Layout + +```text +tests/ut/ +├── cpp/ # C++ GoogleTest binaries (CMake) +│ ├── CMakeLists.txt # Build orchestration and helper functions +│ ├── test_helpers.h # Shared test utilities +│ ├── stubs/ +│ │ └── test_stubs.cpp # Platform-abstraction stubs (logging, asserts) +│ ├── hierarchical/ # Host-side hierarchical runtime (L0-L6) +│ │ ├── test_tensormap.cpp +│ │ ├── test_ring.cpp +│ │ ├── test_scope.cpp +│ │ ├── test_orchestrator.cpp +│ │ ├── test_scheduler.cpp +│ │ └── test_worker_manager.cpp +│ ├── platform/ # Platform abstraction layer (sim variant) +│ │ ├── test_platform_memory_allocator.cpp +│ │ └── test_platform_host_log.cpp +│ ├── types/ # Cross-cutting ABI-contract types +│ │ ├── test_child_memory.cpp +│ │ ├── test_pto_types.cpp +│ │ └── test_tensor.cpp +│ ├── pto2_a2a3/ # PTO2 on-chip runtime (A2A3 architecture) +│ │ ├── test_a2a3_pto2_fatal.cpp +│ │ ├── test_core_types.cpp +│ │ ├── test_dispatch_payload.cpp +│ │ ├── test_handshake.cpp +│ │ ├── test_submit_types.cpp +│ │ ├── test_ring_buffer.cpp +│ │ ├── test_ring_buffer_edge.cpp +│ │ ├── test_tensormap_edge.cpp +│ │ ├── test_ready_queue.cpp +│ │ ├── test_scheduler_state.cpp +│ │ ├── test_scheduler_edge.cpp +│ │ ├── test_shared_memory.cpp +│ │ ├── test_boundary_edge.cpp +│ │ ├── test_coupling.cpp +│ │ ├── test_coupling_stub.cpp +│ │ ├── test_runtime_graph.cpp +│ │ ├── test_runtime_lifecycle.cpp +│ │ ├── test_runtime_status.cpp +│ │ ├── test_orchestrator_submit.cpp +│ │ └── test_orchestrator_fatal.cpp +│ ├── pto2_a5/ # PTO2 on-chip runtime (A5 architecture) +│ │ └── test_a5_pto2_fatal.cpp +│ └── hardware/ # Hardware-gated tests (CANN required) +│ └── test_hccl_comm.cpp +└── py/ # Python pytest-based tests + ├── conftest.py # Fixtures, sys.path setup + ├── test_elf_parser.py + ├── test_env_manager.py + ├── test_kernel_compiler.py + ├── test_runtime_compiler.py + ├── test_toolchain.py + ├── test_toolchain_setup.py + ├── test_task_interface.py + ├── test_runtime_builder.py + ├── 
test_chip_worker.py + ├── test_hostsub_fork_shm.py + ├── test_worker/ # Worker subsystem tests + │ ├── test_host_worker.py + │ ├── test_bootstrap_channel.py + │ ├── test_bootstrap_context_hw.py + │ ├── test_bootstrap_context_sim.py + │ ├── test_error_propagation.py + │ ├── test_group_task.py + │ ├── test_l4_recursive.py + │ ├── test_mailbox_atomics.py + │ ├── test_multi_worker.py + │ ├── test_platform_comm.py + │ ├── test_worker_distributed_hw.py + │ └── test_worker_distributed_sim.py +``` + +## Organization Principles + +### Subdirectory-per-component + +C++ tests are grouped by the source component they exercise: + +| Subdirectory | Source under test | CMake helper | +| ------------ | ----------------- | ------------ | +| `hierarchical/` | `src/common/hierarchical/` | `add_hierarchical_test` | +| `platform/` | `src/a2a3/platform/` (sim variant) | inline targets | +| `types/` | `src/common/task_interface/` | `add_task_interface_test` | +| `pto2_a2a3/` | `src/a2a3/runtime/tensormap_and_ringbuffer/` | `add_a2a3_pto2_test` / `add_a2a3_pto2_runtime_test` | +| `pto2_a5/` | `src/a5/runtime/tensormap_and_ringbuffer/` | `add_a5_pto2_test` | +| `hardware/` | HCCL comm backend (needs CANN) | `add_comm_api_test` | + +Python tests are grouped by functional area: build infrastructure +(compilers, toolchain, ELF parsing), nanobind bindings, and the worker +subsystem. + +### Header-only vs runtime-linked + +PTO2 tests come in two flavors: + +- **Header-only** (`add_a2a3_pto2_test`): compile against orchestration/ + runtime headers only. No `.cpp` from the runtime is linked. Used for + type-layout, constant, and API-contract tests. +- **Runtime-linked** (`add_a2a3_pto2_runtime_test`): link the real + `pto_ring_buffer.cpp`, `pto_shared_memory.cpp`, `pto_scheduler.cpp`, + `pto_tensormap.cpp` (and optionally `pto_orchestrator.cpp`, + `pto_runtime2.cpp`). Used for behavioral and integration tests. 
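As an illustration of what the header-only flavor pins down, here is the same kind of layout assertion transplanted to Python/ctypes. This is purely a sketch: the field names are invented, and only the 64-byte `PTO2TaskSlotState` size documented later in this reference is assumed:

```python
import ctypes

# Illustrative ABI-contract check in the spirit of the header-only tests.
# Fields are hypothetical; the invariant (one 64-byte cache line) is the
# kind of contract the C++ tests enforce with static_assert/sizeof.
class TaskSlotState(ctypes.Structure):
    _fields_ = [
        ("state",  ctypes.c_int32),
        ("fanin",  ctypes.c_int32),
        ("fanout", ctypes.c_int32),
        ("_pad",   ctypes.c_int8 * 52),  # pad to one cache line
    ]

assert ctypes.sizeof(TaskSlotState) == 64  # layout drift would fail here
```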
+ +### Hardware gating + +All tests default to `no_hardware` (runnable on standard CI runners). Tests +that need Ascend hardware are gated by: + +- **C++**: `SIMPLER_ENABLE_HARDWARE_TESTS` CMake option + ctest labels + (`requires_hardware_a2a3`). +- **Python**: `@pytest.mark.requires_hardware` / `requires_hardware("a2a3")` + markers. + +### Test-design conventions + +- **AAA pattern**: Arrange-Act-Assert structure in each test. +- **Fixtures over globals**: GoogleTest fixtures (`TEST_F`) manage per-test + state; pytest fixtures handle setup/teardown. +- **Stubs for platform isolation**: `stubs/test_stubs.cpp` provides logging, + assertion, and timer stubs so on-chip runtime code compiles on x86/macOS + without CANN dependencies. +- **Edge-case files**: Files named `*_edge.cpp` focus on boundary conditions, + concurrency stress, and design-contract verification. + +## Test Design Philosophy + +The suite targets three goals: + +1. **ABI contract verification** — `sizeof`, `alignof`, field offsets, and + enum values are checked with `static_assert` and runtime assertions. + This catches silent layout drift when headers change. + +2. **Component isolation** — each test exercises one module with minimal + dependencies. Coupling tests (`test_coupling.cpp`, + `test_coupling_stub.cpp`) explicitly measure and document inter-component + dependencies. + +3. **Bug-candidate documentation** — edge-case tests encode known defects + and design tradeoffs as executable tests. When a test documents a real + src defect, it is preserved as a regression barrier. When a test + documents intentional design (e.g., LIFO dispatch order), it serves as + a contract anchor. + +## Coverage Map + +### C++ — Hierarchical Runtime (`hierarchical/`) + +Source: `src/common/hierarchical/` + +| File | Tests | What it covers | +| ---- | ----- | -------------- | +| `test_tensormap.cpp` | 4 | Insert, lookup, overwrite, erase by task ID. Compound keys (pointer + worker ID). 
| +| `test_ring.cpp` | 5+ | Slot allocation monotonicity, heap slab alignment, FIFO reclamation, allocation bounds, back-pressure with small heap (8 KiB). | +| `test_scope.cpp` | 5 | Scope depth tracking, begin/end pairing, nested scopes, task registration and release callbacks, empty scope handling. | +| `test_orchestrator.cpp` | 1+ | Wiring TensorMap + Ring + Scope + ReadyQueues into a full Orchestrator. Independent-task readiness detection. | +| `test_scheduler.cpp` | 2+ | MockWorker-based dispatch verification. Single-task and task-group dispatch through Scheduler + WorkerManager integration. | +| `test_worker_manager.cpp` | 4+ | Worker pool lifecycle (THREAD mode), idle worker selection, dispatch, group dispatch. CountingWorker tracks run() calls. | + +### C++ — Platform Abstraction (`platform/`) + +Source: `src/a2a3/platform/` + +| File | Tests | What it covers | +| ---- | ----- | -------------- | +| `test_platform_memory_allocator.cpp` | 4 | Sim memory allocator: allocation tracking, multi-allocation, nullptr safety, untracked-pointer handling. | +| `test_platform_host_log.cpp` | 3+ | HostLogger singleton: level filtering (`is_enabled`), env-var parsing (`PTO_LOG_LEVEL`), `reinitialize()` behavior. | + +### C++ — Cross-cutting Types (`types/`) + +Source: `src/common/task_interface/` + +| File | Tests | What it covers | +| ---- | ----- | -------------- | +| `test_child_memory.cpp` | 3 | `ContinuousTensor` ABI layout (`sizeof == 40`), `child_memory` bit field, blob serialization roundtrip. | +| `test_pto_types.cpp` | 5+ | `TaskOutputTensors` init/materialize/get_ref/max-outputs, `Arg` tensor/scalar storage, `add_scalars_i32` zero-extension, `copy_scalars_from`. | +| `test_tensor.cpp` | 5+ | Segment intersection logic (overlapping, touching, disjoint, zero-length), `make_tensor_external()` factory, cache-line layout coupling. 
| + +### C++ — PTO2 A2A3 On-chip Runtime (`pto2_a2a3/`) + +Source: `src/a2a3/runtime/tensormap_and_ringbuffer/` + +#### API and type contracts (header-only) + +| File | Tests | What it covers | +| ---- | ----- | -------------- | +| `test_a2a3_pto2_fatal.cpp` | 3+ | Fatal-path reporting through `pto2_orchestration_api.h`. Fake runtime + va_list formatting. | +| `test_core_types.cpp` | 5 | `PTO2TaskId` encode/extract (ring in upper 32, local in lower 32), roundtrip, `PTO2TaskSlotState` size (64 bytes), `PTO2_ALIGN_UP` macro. | +| `test_dispatch_payload.cpp` | 5+ | `PTO2DispatchPayload` 64-byte alignment, SPMD context index constants, `LocalContext`/`GlobalContext` field read/write. | +| `test_handshake.cpp` | 4+ | Handshake protocol macros: `MAKE_ACK_VALUE`/`MAKE_FIN_VALUE`, `EXTRACT_TASK_ID`/`EXTRACT_TASK_STATE`, bit-31 state encoding, reserved task IDs. | +| `test_submit_types.cpp` | 3+ | `pto2_subtask_active()` bitmask (AIC, AIV0, AIV1), `pto2_active_mask_to_shape()`, `pto2_mixed_kernels_to_active_mask()`. | +| `test_runtime_status.cpp` | 9 | `pto2_runtime_status()`: zero codes, single-error negation, precedence rules (orch > sched), pass-through for already-negative codes, range non-overlap. | + +#### Ring buffer and memory allocation (runtime-linked) + +| File | Tests | What it covers | +| ---- | ----- | -------------- | +| `test_ring_buffer.cpp` | 10+ | `PTO2TaskAllocator` init, state queries, window size, heap allocation, FIFO reclamation, wrap-guard boundary. | +| `test_ring_buffer_edge.cpp` | 10+ | Edge cases: wrap-guard at `tail==alloc_size`, fragmentation reporting (`max` not `sum`), zero-size allocation, exact-heap-size allocation, oversized allocation, window saturation, slot mapping, task ID near INT32_MAX. `DepListPool` edge cases: contract violation, prepend chain, high-water mark, overflow error code. 
| + +#### TensorMap (runtime-linked) + +| File | Tests | What it covers | +| ---- | ----- | -------------- | +| `test_tensormap_edge.cpp` | 15+ | Bug-candidate documentation: `check_overlap()` dimension mismatch, lookup saturation (16-producer limit), pool exhaustion, ABA in `cleanup_retired()`, `copy_from_tensor` zero-padding. Edge cases: 0-dim tensors, max-dim tensors, zero-length shapes. | + +#### Scheduler and ready queue (runtime-linked) + +| File | Tests | What it covers | +| ---- | ----- | -------------- | +| `test_ready_queue.cpp` | 17 | `ReadyQueue` MPMC: empty pop, single push/pop, FIFO ordering, capacity limit, slot reuse, batch push/pop, size accuracy. Multi-threaded: 2P/2C and 1P/4C stress. `LocalReadyBuffer` LIFO: reset, ordering, overflow. | +| `test_scheduler_state.cpp` | 5+ | `init_slot()` helper, `check_and_handle_consumed` transitions (COMPLETED to CONSUMED), fanin/fanout reference counting. | +| `test_scheduler_edge.cpp` | 25+ | `ReadyQueue` edge cases: interleaved push/pop, exact-capacity fill/drain, relaxed-ordering size guard, high-contention stress (4P/4C, 5000 items). `LocalReadyBuffer` LIFO dispatch order, overflow, null backing. `SharedMem` edge: zero window size, corruption detection, undersized buffer, region non-overlap, header alignment. `TaskState` lifecycle: PENDING to CONSUMED, simultaneous subtask completion, fanin/fanout exactly-once semantics, invalid transitions. | + +#### Shared memory (runtime-linked) + +| File | Tests | What it covers | +| ---- | ----- | -------------- | +| `test_shared_memory.cpp` | 6+ | `PTO2SharedMemoryHandle` create/destroy, ownership, header init values, per-ring independence, pointer alignment (`PTO2_ALIGN_SIZE`), `calculate_size()`. | + +#### Boundary and stress tests (runtime-linked) + +| File | Tests | What it covers | +| ---- | ----- | -------------- | +| `test_boundary_edge.cpp` | 17+ | `ReadyQueue` stress: 8P/8C, rapid fill/drain cycles, batch contention. 
`TaskAllocator` re-init: reset counter, heap, error state, multi-cycle, stale `last_alive`. Sequence wrap near INT64_MAX: single, fill/drain, interleaved, batch, concurrent. `SharedMemory` concurrency: per-ring isolation, atomic increment, `orchestrator_done` race, monotonic advancement, validate after concurrent writes. | + +#### Coupling analysis (runtime-linked) + +| File | Tests | What it covers | +| ---- | ----- | -------------- | +| `test_coupling.cpp` | 4+ | Architectural coupling detection: whether components can operate in isolation. `TMRSystem` full init/destroy measuring dependency graph. | +| `test_coupling_stub.cpp` | 14 | `DepPool` stub isolation: reclaim below/at interval. Scheduler without orchestrator: init/destroy, standalone `ReadyQueue`, fanin release, non-profiling path, mixed-task completion. `TensorMap` link decoupling: builds without `orchestrator.cpp`, orchestrator pointer never dereferenced in hot path. Compile-time include coupling: `RingBuffer` to `Scheduler`, duplicated slot-mask formula, `PTO2_MAX_RING_DEPTH` in 4 components, transitive includes. Profiling behavior: CAS guard in profiling vs atomicity in non-profiling. | + +#### Orchestrator (runtime-linked) + +| File | Tests | What it covers | +| ---- | ----- | -------------- | +| `test_orchestrator_submit.cpp` | 12 | `set_scheduler`, `alloc_tensors` validation (empty/scalar/input args mark fatal), output-only materialization, post-fatal short-circuit, submit with error args, pure-input submit, output materialization, `orchestrator_done` idempotency. | +| `test_orchestrator_fatal.cpp` | 11 | Fatal error latching: initial state, `report_fatal` sets local flag + shared code, second report does not overwrite, `ERROR_NONE` does not latch, all 9 error codes latch correctly, null/empty/varargs format strings, status helper reads latched code. 
| + +#### Runtime lifecycle (runtime-linked) + +| File | Tests | What it covers | +| ---- | ----- | -------------- | +| `test_runtime_lifecycle.cpp` | 12 | `pto2_runtime_create_custom` initialization, orchestrator-to-scheduler connection, default creation, null SM handle, caller-allocated buffers, null-safe destroy, heap release, `set_mode`, ops table population, `is_fatal` / `report_fatal`. | + +#### Runtime graph — host_build_graph (runtime-linked) + +| File | Tests | What it covers | +| ---- | ----- | -------------- | +| `test_runtime_graph.cpp` | 10 | `RuntimeGraph`: monotonic task IDs, field storage, successor updates (fanout/fanin), ready-task detection, diamond DAG, linear chain, fanout/fanin consistency, max-task limit, tensor-pair management, function binary address mapping. | + +### C++ — PTO2 A5 On-chip Runtime (`pto2_a5/`) + +Source: `src/a5/runtime/tensormap_and_ringbuffer/` + +| File | Tests | What it covers | +| ---- | ----- | -------------- | +| `test_a5_pto2_fatal.cpp` | 3 | API short-circuit after fatal, explicit fatal routing through ops table, `alloc_tensor` overflow reports invalid args instead of asserting. | + +### C++ — Hardware Tests (`hardware/`) + +Gated by `SIMPLER_ENABLE_HARDWARE_TESTS=ON`. Labeled +`requires_hardware_a2a3`. + +| File | Tests | What it covers | +| ---- | ----- | -------------- | +| `test_hccl_comm.cpp` | 3+ | HCCL backend lifecycle: `dlopen(libhost_runtime.so)`, comm init/alloc/query/destroy. CTest resource allocation for 2-device tests. | + +### Python — Build Infrastructure (`py/`) + +| File | Tests | What it covers | +| ---- | ----- | -------------- | +| `test_elf_parser.py` | 3+ | ELF64 and Mach-O `.text` section extraction from raw struct-packed binaries. `_extract_cstring`, `extract_text_section`. | +| `test_env_manager.py` | 5+ | `env_manager.get()`, `ensure()`, caching behavior, error on unset/empty vars. Uses `monkeypatch` for env isolation. 
| +| `test_kernel_compiler.py` | 4+ | Platform include dirs (a2a3 vs a5), orchestration include dirs. Mock `ASCEND_HOME_PATH` fixture. | +| `test_runtime_compiler.py` | 4+ | `BuildTarget` CMake arg generation, `root_dir` absoluteness, `binary_name`, `RuntimeCompiler` singleton reset. | +| `test_toolchain.py` | 5+ | `_parse_compiler_env()` for conda flags, `GxxToolchainCmakeArgs` (plain/conda env, quoted paths, CMAKE_C/CXX_FLAGS). | +| `test_toolchain_setup.py` | 18 | CCEC toolchain compile flags (a2a3/a5, aic/aiv), unknown platform, missing compiler. Gxx15 toolchain (`__DAV_VEC__`/`__DAV_CUBE__` defines, `__CPU_SIM`). Gxx/Aarch64Gxx cmake args, env vars, cross-compile. `ToolchainType` enum values. | +| `test_runtime_builder.py` | 16 | Runtime discovery (real project tree), config resolution, missing/empty dirs, sorted output. `get_binaries()` error handling, compiler invocation count, path resolution, error propagation. Integration: real compilation produces non-empty `.so` files. | + +### Python — Nanobind and Type Contracts (`py/`) + +| File | Tests | What it covers | +| ---- | ----- | -------------- | +| `test_task_interface.py` | 10+ | `DataType` enum ABI values (FLOAT32, FLOAT16, INT32, ...), `get_element_size()` parametrized, nanobind `_task_interface` extension (`ContinuousTensor`, `TaskArgs`, `ChipStorageTaskArgs`), torch integration. | + +### Python — ChipWorker and Fork/SHM (`py/`) + +| File | Tests | What it covers | +| ---- | ----- | -------------- | +| `test_chip_worker.py` | 11 | `ChipCallConfig` defaults/setters/repr. `ChipWorker` state machine: uninitialized state, run-before-set-device, set-device-before-init, reset/finalize idempotency, init-after-finalize, nonexistent lib. Python import verification. | +| `test_hostsub_fork_shm.py` | 6 | `SharedMemory` cross-fork access. `torch.share_memory_()` mutations across fork. Callable registry in forked child. Mailbox state machine (IDLE/TASK_READY/TASK_DONE cycling). 
Parallel wall-time verification (3 SubWorkers). Threading after fork. | + +### Python — Worker Subsystem (`py/test_worker/`) + +| File | Tests | What it covers | +| ---- | ----- | -------------- | +| `test_host_worker.py` | 18 | Worker lifecycle (init/close, context manager, register-after-init). Single sub-task execution and multiple runs. `submit_sub()` return type. Scope management (run-managed, user-nested, 3-deep nesting). `alloc()` tensor validity, dependency wiring, unused-freed, no-leak across runs. Sub-callable receives tensor metadata, scalar, empty args. | +| `test_bootstrap_channel.py` | 7 | `BootstrapChannel` state machine: fresh=IDLE, write success/error fields, reset transition, cross-process fork, buffer-ptr overflow, error message truncation. | +| `test_bootstrap_context_hw.py` | 1 | 2-rank hardware smoke: `ChipWorker.bootstrap_context` populates device_ctx, window_base, window_size, buffer_ptrs. | +| `test_bootstrap_context_sim.py` | 4 | 2-rank sim bootstrap, `load_from_host` roundtrip, channel SUCCESS fields, invalid-placement error publishing. | +| `test_error_propagation.py` | 5 | Sub-worker exception surfacing (type/message preserved), missing callable_id, failure-does-not-wedge (next run succeeds), post-failure submit re-raises, L4-chained failure surfaces with layer prefixes. | +| `test_group_task.py` | 3 | `submit_sub_group` with 2 args dispatches to 2 SubWorkers, single-arg group, group-then-dependent-task ordering. | +| `test_l4_recursive.py` | 13 | L4 lifecycle (no children, with L3 child, context manager). Validation (level check, add-after-init, initialized-child). L4-to-L3 dispatch (single, triple, with own subs). Multiple runs no-leak. L3 child with multiple subs. L3 own orchestrator. Generalized `_Worker` level parameter. | +| `test_mailbox_atomics.py` | 6 | `_mailbox_store_i32`/`load_i32` roundtrip (positive, negative, offset). Cross-process visibility via `MAP_SHARED`. Release/acquire ordering: payload visible when state observed. 
L3 sub-worker dispatch roundtrip. | +| `test_multi_worker.py` | 3 | Two-worker parallel execution with thread-local isolation. Sequential task stress (20 tasks, 1 SubWorker). 20 tasks across 2 SubWorkers, all complete exactly once. | +| `test_platform_comm.py` | 1 | 2-rank hardware smoke: `comm_init` to `comm_destroy` lifecycle (barrier failure tolerated per HCCL 507018). | +| `test_worker_distributed_hw.py` | 1 | 2-rank hardware smoke: `Worker(chip_bootstrap_configs=...)` populates `chip_contexts` with device_ctx, window_base, buffer_ptrs per rank. No `comm_barrier`. | +| `test_worker_distributed_sim.py` | 5 | Worker-level chip bootstrap on sim: happy-path `chip_contexts` population + `/dev/shm` leak check, pre-init access rejection, invalid placement error path + cleanup, level-below-3 rejection, config/device_ids length mismatch. | + +## Test Counts Summary + +| Category | Files | Approx. test cases | +| -------- | ----- | ------------------ | +| C++ hierarchical | 6 | 20+ | +| C++ platform | 2 | 7+ | +| C++ types | 3 | 13+ | +| C++ PTO2 A2A3 | 19 | 180+ | +| C++ PTO2 A5 | 1 | 3 | +| C++ hardware | 1 | 3+ | +| Python build infra | 6 | 50+ | +| Python nanobind | 1 | 10+ | +| Python ChipWorker/fork | 2 | 17 | +| Python worker subsystem | 12 | 67+ | +| **Total** | **53** | **371+** | + +## Infrastructure + +### CMake Helper Functions + +| Function | Linker scope | Use for | +| -------- | ------------ | ------- | +| `add_hierarchical_test(name src)` | Full hierarchical runtime sources | Tests under `hierarchical/` | +| `add_task_interface_test(name src)` | Header-only (`task_interface/`) | ABI-contract tests under `types/` | +| `add_a2a3_pto2_test(name src)` | Header-only (orchestration + runtime headers) | PTO2 type/constant tests | +| `add_a2a3_pto2_runtime_test(name SOURCES ... 
EXTRA_SOURCES ...)` | Stubs + selected runtime `.cpp` files | Behavioral PTO2 tests | +| `add_a5_pto2_test(name src)` | Header-only (A5 orchestration + runtime) | A5-specific tests | +| `add_comm_api_test(name src)` | CANN `libascendcl` + `dlopen` | Hardware-gated HCCL tests | + +### Platform Stubs (`stubs/test_stubs.cpp`) + +Provides userspace implementations for symbols that on-chip runtime code +expects from the AICPU environment: + +- `unified_log_{error,warn,info,debug,always}` — logging (stderr) +- `get_sys_cnt_aicpu()` — timer stub (returns 0) +- `get_stacktrace()` — stack trace (returns empty string) +- `assert_impl()` — assertion handler (throws `AssertionError`) + +This allows the full runtime `.cpp` files to compile and link on +x86_64/aarch64/macOS without CANN. + +### Python conftest (`py/conftest.py`) + +- Adds `PROJECT_ROOT` to `sys.path` for `import simpler_setup` +- Adds `python/` for `from simpler import env_manager` +- Adds `python/simpler/` for legacy `import env_manager` compatibility +- Provides `project_root` fixture returning the `PROJECT_ROOT` `Path` + +### Test Helpers (`test_helpers.h`) + +- `test_ready_queue_init()` — initialize a `ReadyQueue` with + caller-provided buffer and arbitrary start sequence number diff --git a/simpler_setup/runtime_compiler.py b/simpler_setup/runtime_compiler.py index 4de30ec2b..519ef0726 100644 --- a/simpler_setup/runtime_compiler.py +++ b/simpler_setup/runtime_compiler.py @@ -78,6 +78,11 @@ def get_instance(cls, platform: str = "a2a3") -> "RuntimeCompiler": cls._instances[platform] = cls(platform) return cls._instances[platform] + @classmethod + def reset_instances(cls) -> None: + """Clear the singleton cache. 
Intended for test isolation."""
+        cls._instances.clear()
+
     def __init__(self, platform: str = "a2a3"):
         self.platform = platform
         self.project_root = PROJECT_ROOT
diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index ced571d80..5fec9c8e0 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -74,6 +74,49 @@ if(NOT GTEST_LIB OR NOT GTEST_MAIN_LIB)
   set(GTEST_INCLUDE_DIRS "") # include dirs are carried by the gtest target
 endif()
 
+# ---------------------------------------------------------------------------
+# PTO2 runtime sources and stubs for a2a3 ring-buffer / tensormap tests
+# ---------------------------------------------------------------------------
+set(A2A3_RUNTIME_DIR ${CMAKE_SOURCE_DIR}/../../../src/a2a3/runtime/tensormap_and_ringbuffer/runtime)
+set(PTO2_STUB_SOURCES ${CMAKE_SOURCE_DIR}/stubs/test_stubs.cpp)
+set(PTO2_RUNTIME_SOURCES
+    ${A2A3_RUNTIME_DIR}/pto_ring_buffer.cpp
+    ${A2A3_RUNTIME_DIR}/pto_shared_memory.cpp
+    ${A2A3_RUNTIME_DIR}/pto_scheduler.cpp
+    ${A2A3_RUNTIME_DIR}/pto_tensormap.cpp
+)
+
+set(PTO2_COMMON_INCLUDE_DIRS
+    ${CMAKE_SOURCE_DIR}/../../../src/a2a3/runtime/tensormap_and_ringbuffer/orchestration
+    ${CMAKE_SOURCE_DIR}/../../../src/a2a3/runtime/tensormap_and_ringbuffer/runtime
+    ${CMAKE_SOURCE_DIR}/../../../src/a2a3/runtime/tensormap_and_ringbuffer/common
+    ${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform/include
+    ${CMAKE_SOURCE_DIR}/../../../src/common/task_interface
+)
+
+function(add_a2a3_pto2_runtime_test name)
+  cmake_parse_arguments(ARG "" "" "SOURCES;EXTRA_SOURCES" ${ARGN})
+  # SOURCES and the stubs are always compiled; EXTRA_SOURCES are filtered
+  # so that optional runtime .cpp files may be absent.
+  set(_all_sources ${ARG_SOURCES} ${PTO2_STUB_SOURCES})
+  foreach(src ${ARG_EXTRA_SOURCES})
+    if(EXISTS ${src})
+      list(APPEND _all_sources ${src})
+    endif()
+  endforeach()
+  add_executable(${name} ${_all_sources})
+  target_include_directories(${name} PRIVATE
+    ${GTEST_INCLUDE_DIRS}
+    ${PTO2_COMMON_INCLUDE_DIRS}
+  )
+  target_compile_options(${name} PRIVATE -D_GLIBCXX_USE_CXX11_ABI=0)
+
target_link_libraries(${name} PRIVATE + ${GTEST_MAIN_LIB} + ${GTEST_LIB} + pthread + ) + add_test(NAME ${name} COMMAND ${name}) + set_tests_properties(${name} PROPERTIES LABELS "no_hardware") +endfunction() + # --------------------------------------------------------------------------- # Distributed runtime sources under test # --------------------------------------------------------------------------- @@ -153,11 +196,15 @@ endfunction() enable_testing() -add_hierarchical_test(test_tensormap test_tensormap.cpp) -add_hierarchical_test(test_ring test_ring.cpp) -add_hierarchical_test(test_scope test_scope.cpp) -add_hierarchical_test(test_orchestrator test_orchestrator.cpp) -add_hierarchical_test(test_scheduler test_scheduler.cpp) +# --------------------------------------------------------------------------- +# Hierarchical runtime tests (src/common/hierarchical/) +# --------------------------------------------------------------------------- +add_hierarchical_test(test_tensormap hierarchical/test_tensormap.cpp) +add_hierarchical_test(test_ring hierarchical/test_ring.cpp) +add_hierarchical_test(test_scope hierarchical/test_scope.cpp) +add_hierarchical_test(test_orchestrator hierarchical/test_orchestrator.cpp) +add_hierarchical_test(test_scheduler hierarchical/test_scheduler.cpp) +add_hierarchical_test(test_worker_manager hierarchical/test_worker_manager.cpp) function(add_task_interface_test name src) add_executable(${name} ${src}) target_include_directories(${name} PRIVATE @@ -173,9 +220,160 @@ function(add_task_interface_test name src) add_test(NAME ${name} COMMAND ${name}) endfunction() -add_task_interface_test(test_child_memory test_child_memory.cpp) -add_a2a3_pto2_test(test_a2a3_pto2_fatal test_a2a3_pto2_fatal.cpp) -add_a5_pto2_test(test_a5_pto2_fatal test_a5_pto2_fatal.cpp) +# --------------------------------------------------------------------------- +# Types / task_interface tests (src/common/task_interface/) +# 
--------------------------------------------------------------------------- +add_task_interface_test(test_child_memory types/test_child_memory.cpp) + +# --------------------------------------------------------------------------- +# PTO2 A2A3 tests (src/a2a3/runtime/tensormap_and_ringbuffer/) +# --------------------------------------------------------------------------- +add_a2a3_pto2_test(test_a2a3_pto2_fatal pto2_a2a3/test_a2a3_pto2_fatal.cpp) +add_a2a3_pto2_test(test_runtime_status pto2_a2a3/test_runtime_status.cpp) +add_a2a3_pto2_test(test_core_types pto2_a2a3/test_core_types.cpp) +add_a2a3_pto2_test(test_dispatch_payload pto2_a2a3/test_dispatch_payload.cpp) +add_a2a3_pto2_test(test_handshake pto2_a2a3/test_handshake.cpp) +add_a2a3_pto2_test(test_submit_types pto2_a2a3/test_submit_types.cpp) +add_a2a3_pto2_runtime_test(test_pto_types + SOURCES types/test_pto_types.cpp +) +add_a2a3_pto2_runtime_test(test_tensor + SOURCES types/test_tensor.cpp +) + +# --------------------------------------------------------------------------- +# PTO2 A5 tests (src/a5/runtime/tensormap_and_ringbuffer/) +# --------------------------------------------------------------------------- +add_a5_pto2_test(test_a5_pto2_fatal pto2_a5/test_a5_pto2_fatal.cpp) + +# PTO2 runtime-linked tests (ring buffer, tensormap, dep pool, etc.) 
+add_a2a3_pto2_runtime_test(test_ring_buffer + SOURCES pto2_a2a3/test_ring_buffer.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} +) +add_a2a3_pto2_runtime_test(test_ring_buffer_edge + SOURCES pto2_a2a3/test_ring_buffer_edge.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} +) +add_a2a3_pto2_runtime_test(test_tensormap_edge + SOURCES pto2_a2a3/test_tensormap_edge.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} +) +add_a2a3_pto2_runtime_test(test_coupling + SOURCES pto2_a2a3/test_coupling.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} ${A2A3_RUNTIME_DIR}/pto_orchestrator.cpp +) +add_a2a3_pto2_runtime_test(test_coupling_stub + SOURCES pto2_a2a3/test_coupling_stub.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} +) +add_a2a3_pto2_runtime_test(test_ready_queue + SOURCES pto2_a2a3/test_ready_queue.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} +) +add_a2a3_pto2_runtime_test(test_scheduler_state + SOURCES pto2_a2a3/test_scheduler_state.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} +) +add_a2a3_pto2_runtime_test(test_scheduler_edge + SOURCES pto2_a2a3/test_scheduler_edge.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} +) +add_a2a3_pto2_runtime_test(test_boundary_edge + SOURCES pto2_a2a3/test_boundary_edge.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} +) +add_a2a3_pto2_runtime_test(test_shared_memory + SOURCES pto2_a2a3/test_shared_memory.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} +) +# Runtime lifecycle tests — link the full PTO2 runtime including +# pto_runtime2.cpp + pto_orchestrator.cpp so create/destroy is exercised end-to-end. 
+add_a2a3_pto2_runtime_test(test_runtime_lifecycle + SOURCES pto2_a2a3/test_runtime_lifecycle.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} + ${A2A3_RUNTIME_DIR}/pto_runtime2.cpp + ${A2A3_RUNTIME_DIR}/pto_orchestrator.cpp +) + +# Orchestrator submit-path tests +add_a2a3_pto2_runtime_test(test_orchestrator_submit + SOURCES pto2_a2a3/test_orchestrator_submit.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} ${A2A3_RUNTIME_DIR}/pto_orchestrator.cpp +) + +# Orchestrator fatal-path tests — exercise the real pto2_orch_report_fatal +# against a fully-initialized orchestrator + shared memory pair. +add_a2a3_pto2_runtime_test(test_orchestrator_fatal + SOURCES pto2_a2a3/test_orchestrator_fatal.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} ${A2A3_RUNTIME_DIR}/pto_orchestrator.cpp +) + +# --------------------------------------------------------------------------- +# Platform sim layer tests — only the sim variants are pure userspace and +# can be unit-tested off-hardware. Hardware (onboard) variants that wrap +# rtMalloc/rtFree / device registers are exercised by the hardware CI. 
+# --------------------------------------------------------------------------- +set(A2A3_PLATFORM_SIM_DIR ${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform/sim) +set(A2A3_PLATFORM_HOST_DIR ${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform/src/host) +set(A2A3_PLATFORM_INCLUDE ${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform/include) + +add_executable(test_platform_memory_allocator + platform/test_platform_memory_allocator.cpp + ${A2A3_PLATFORM_SIM_DIR}/host/memory_allocator.cpp + ${PTO2_STUB_SOURCES} +) +target_include_directories(test_platform_memory_allocator PRIVATE + ${GTEST_INCLUDE_DIRS} + ${A2A3_PLATFORM_INCLUDE} +) +target_compile_options(test_platform_memory_allocator PRIVATE -D_GLIBCXX_USE_CXX11_ABI=0) +target_link_libraries(test_platform_memory_allocator PRIVATE + ${GTEST_MAIN_LIB} + ${GTEST_LIB} + pthread +) +add_test(NAME test_platform_memory_allocator COMMAND test_platform_memory_allocator) +set_tests_properties(test_platform_memory_allocator PROPERTIES LABELS "no_hardware") + +add_executable(test_platform_host_log + platform/test_platform_host_log.cpp + ${A2A3_PLATFORM_HOST_DIR}/host_log.cpp +) +target_include_directories(test_platform_host_log PRIVATE + ${GTEST_INCLUDE_DIRS} + ${A2A3_PLATFORM_HOST_DIR} +) +target_compile_options(test_platform_host_log PRIVATE -D_GLIBCXX_USE_CXX11_ABI=0) +target_link_libraries(test_platform_host_log PRIVATE + ${GTEST_MAIN_LIB} + ${GTEST_LIB} + pthread +) +add_test(NAME test_platform_host_log COMMAND test_platform_host_log) +set_tests_properties(test_platform_host_log PROPERTIES LABELS "no_hardware") + +# host_build_graph Runtime test — needs the HBG runtime.cpp source +set(HBG_RUNTIME_DIR ${CMAKE_SOURCE_DIR}/../../../src/a2a3/runtime/host_build_graph/runtime) +add_executable(test_runtime_graph + pto2_a2a3/test_runtime_graph.cpp + ${HBG_RUNTIME_DIR}/runtime.cpp + ${PTO2_STUB_SOURCES} +) +target_include_directories(test_runtime_graph PRIVATE + ${GTEST_INCLUDE_DIRS} + ${HBG_RUNTIME_DIR} + 
${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform/include + ${CMAKE_SOURCE_DIR}/../../../src/common/task_interface +) +target_compile_options(test_runtime_graph PRIVATE -D_GLIBCXX_USE_CXX11_ABI=0) +target_link_libraries(test_runtime_graph PRIVATE + ${GTEST_MAIN_LIB} + ${GTEST_LIB} + pthread +) +add_test(NAME test_runtime_graph COMMAND test_runtime_graph) +set_tests_properties(test_runtime_graph PROPERTIES LABELS "no_hardware") # Hardware-gated tests. Block is only entered when the project is configured # with -DSIMPLER_ENABLE_HARDWARE_TESTS=ON. CI's no-hw `ut` job does not pass @@ -238,5 +436,5 @@ if(SIMPLER_ENABLE_HARDWARE_TESTS) ) endfunction() - add_comm_api_test(test_hccl_comm test_hccl_comm.cpp) + add_comm_api_test(test_hccl_comm hardware/test_hccl_comm.cpp) endif() diff --git a/tests/ut/cpp/test_hccl_comm.cpp b/tests/ut/cpp/hardware/test_hccl_comm.cpp similarity index 96% rename from tests/ut/cpp/test_hccl_comm.cpp rename to tests/ut/cpp/hardware/test_hccl_comm.cpp index 858c488de..73c5cb91e 100644 --- a/tests/ut/cpp/test_hccl_comm.cpp +++ b/tests/ut/cpp/hardware/test_hccl_comm.cpp @@ -12,12 +12,12 @@ /* * Hardware UT guarding the CANN/HCCL-private ABI coupling in comm_hccl.cpp. * - * The call chain (dlopen → create_device_context → ensure_acl_ready_ctx → - * aclrtCreateStream → comm_init → comm_alloc_windows → ...) is not the - * interesting part — the interesting part is *what's inside* CommContext + * The call chain (dlopen -> create_device_context -> ensure_acl_ready_ctx -> + * aclrtCreateStream -> comm_init -> comm_alloc_windows -> ...) is not the + * interesting part -- the interesting part is *what's inside* CommContext * after comm_alloc_windows returns. That struct comes from one of: * - * - MESH topology: `reinterpret_cast(HCCL's return ptr)` — + * - MESH topology: `reinterpret_cast(HCCL's return ptr)` -- * our layout is *assumed* to match HCCL's internal MESH context. 
* - RING topology: our parser reads HcclOpResParam / HcclRankRelationResV2 * field-by-field using offsetof against reverse-engineered struct defs. @@ -40,7 +40,7 @@ * gate SIMPLER_ENABLE_HARDWARE_TESTS. Device allocation is driven by * CTest RESOURCE_GROUPS + --resource-spec-file. * - * Linking strategy: libhost_runtime.so is dlopen'd — it is the subject + * Linking strategy: libhost_runtime.so is dlopen'd -- it is the subject * under test and mirrors how ChipWorker loads a runtime backend in * production. libascendcl.so is linked directly at compile time because * it is generic CANN infra; going through dlsym for acl* here buys nothing @@ -122,14 +122,14 @@ constexpr int EXIT_WINDOW_SIZE = 50; // the CommContext returned by HCCL (MESH reinterpret_cast) or built by our // RING parser actually contains the fields we expect at the offsets we // expect. Failure here means our reverse-engineered CANN ABI disagrees with -// the live HCCL build — the CANN-coupling fragility this test is here for. +// the live HCCL build -- the CANN-coupling fragility this test is here for. constexpr int EXIT_CTX_MEMCPY = 55; constexpr int EXIT_CTX_FIELDS = 56; constexpr int EXIT_BARRIER = 60; constexpr int EXIT_DESTROY = 70; int run_rank(int rank, int nranks, int device_id, const char *rootinfo_path) { - // libhost_runtime.so is the subject under test — dlopen mirrors + // libhost_runtime.so is the subject under test -- dlopen mirrors // ChipWorker. libascendcl is linked in, so acl* is available directly. 
void *host_handle = dlopen(PTO_HOST_RUNTIME_LIB_PATH, RTLD_NOW | RTLD_LOCAL); if (host_handle == nullptr) { @@ -215,7 +215,7 @@ int run_rank(int rank, int nranks, int device_id, const char *rootinfo_path) { host_ctx.windowsIn[rank] != local_base) { fprintf( stderr, - "[rank %d] CommContext field mismatch — CANN ABI drift?\n" + "[rank %d] CommContext field mismatch -- CANN ABI drift?\n" " got: rankId=%u rankNum=%u winSize=%lu windowsIn[%d]=0x%lx\n" " expected: rankId=%d rankNum=%d winSize=%zu windowsIn[%d]=0x%lx\n", rank, host_ctx.rankId, host_ctx.rankNum, static_cast<unsigned long>(host_ctx.winSize), rank, diff --git a/tests/ut/cpp/test_orchestrator.cpp b/tests/ut/cpp/hierarchical/test_orchestrator.cpp similarity index 96% rename from tests/ut/cpp/test_orchestrator.cpp rename to tests/ut/cpp/hierarchical/test_orchestrator.cpp index 14919b11e..82fac02d7 100644 --- a/tests/ut/cpp/test_orchestrator.cpp +++ b/tests/ut/cpp/hierarchical/test_orchestrator.cpp @@ -48,7 +48,7 @@ struct OrchestratorFixture : public ::testing::Test { void TearDown() override { allocator.shutdown(); } - // Per-slot accessor — slot state lives inside the Ring now. + // Per-slot accessor -- slot state lives inside the Ring now. TaskSlotState &S(TaskSlot id) { return *allocator.slot_state(id); } // Helper: build a TaskArgs whose only tensor has the given (data, tag).
@@ -86,7 +86,7 @@ TEST_F(OrchestratorFixture, DependentTaskIsPending) { TaskSlot a_slot; rq.try_pop(a_slot); - // Task B reads INPUT at the same key — depends on A + // Task B reads INPUT at the same key -- depends on A auto args_b = single_tensor_args(0xBEEF, TensorArgType::INPUT); auto b = orch.submit_next_level(0xDEAD, args_b, cfg); EXPECT_EQ(S(b.task_slot).state.load(), TaskState::PENDING); @@ -151,7 +151,7 @@ TEST_F(OrchestratorFixture, NoDepTagSkipsDependencyTracking) { TaskSlot drain_slot; rq.try_pop(drain_slot); - // Second task references same key but tagged NO_DEP — should be independent + // Second task references same key but tagged NO_DEP -- should be independent auto args_b = single_tensor_args(0xAAAA, TensorArgType::NO_DEP); auto b = orch.submit_next_level(0xDEAD, args_b, cfg); EXPECT_EQ(S(b.task_slot).state.load(), TaskState::READY); @@ -215,7 +215,7 @@ TEST_F(OrchestratorFixture, OutputAutoAllocsFromHeapRing) { TEST_F(OrchestratorFixture, InoutWiresCreatorAsFanin) { // INOUT is the only tag that pulls in the prior writer as a fanin - // producer — matching L2's pto_orchestrator.cpp Step B where only + // producer -- matching L2's pto_orchestrator.cpp Step B where only // INPUT / INOUT do tensor_map.lookup. Users who want a WaW dep on // the alloc-slot (so its HeapRing slab stays live while they write) // must tag the buffer INOUT. @@ -250,7 +250,7 @@ TEST_F(OrchestratorFixture, InoutWiresCreatorAsFanin) { TEST_F(OrchestratorFixture, OutputAndOutputExistingAreInsertOnly) { // Contrast with INOUT: plain OUTPUT and OUTPUT_EXISTING are pure - // overwrites — insert into TensorMap, no lookup, so no fanin wire + // overwrites -- insert into TensorMap, no lookup, so no fanin wire // on the prior writer. Matches L2 semantics for both tags. Users // who need creator lifetime must tag the buffer INOUT. 
struct Case { diff --git a/tests/ut/cpp/test_ring.cpp b/tests/ut/cpp/hierarchical/test_ring.cpp similarity index 96% rename from tests/ut/cpp/test_ring.cpp rename to tests/ut/cpp/hierarchical/test_ring.cpp index 05152972d..7f0689b7d 100644 --- a/tests/ut/cpp/test_ring.cpp +++ b/tests/ut/cpp/hierarchical/test_ring.cpp @@ -129,7 +129,7 @@ TEST(Ring, SlotStateIsPointerStable) { TaskSlotState *p0 = a.slot_state(r0.slot); ASSERT_NE(p0, nullptr); - // Push many more slots through — the deque may grow/chain, but the + // Push many more slots through -- the deque may grow/chain, but the // pointer we grabbed for slot 0 has to stay valid. for (int i = 0; i < 1000; ++i) { (void)a.alloc(); @@ -227,7 +227,7 @@ TEST(Ring, ScopeDepthMapsToRingIdx) { } TEST(Ring, PerRingHeapsAreDistinctMmaps) { - // Total VA = 4 × 4 KiB; verify each ring has its own mapping. + // Total VA = 4 x 4 KiB; verify each ring has its own mapping. Ring a; a.init(kSmallHeap, kQuickTimeoutMs); @@ -241,7 +241,7 @@ TEST(Ring, PerRingHeapsAreDistinctMmaps) { for (int i = 0; i < MAX_RING_DEPTH; ++i) { for (int j = i + 1; j < MAX_RING_DEPTH; ++j) { EXPECT_NE(bases[i], bases[j]) - << "rings " << i << " and " << j << " share a mapping — expected 4 separate mmaps"; + << "rings " << i << " and " << j << " share a mapping -- expected 4 separate mmaps"; } } } @@ -292,7 +292,7 @@ TEST(Ring, RingsReclaimIndependently) { EXPECT_EQ(r1a.ring_idx, 1); EXPECT_EQ(r1b.ring_idx, 1); - // Ring 0 is untouched — this must succeed instantly, not time out. + // Ring 0 is untouched -- this must succeed instantly, not time out. auto r0 = a.alloc(HEAP_ALIGN, /*scope_depth=*/0); EXPECT_EQ(r0.ring_idx, 0); ASSERT_NE(r0.heap_ptr, nullptr); @@ -322,7 +322,7 @@ TEST(Ring, InnerRingReclaimsWhileOuterHolds) { EXPECT_EQ(a.heap_top(0), HEAP_ALIGN); EXPECT_EQ(a.heap_tail(0), 0u); - // Churn on the inner ring — allocate, release, allocate, release, ... + // Churn on the inner ring -- allocate, release, allocate, release, ... 
for (int i = 0; i < 8; ++i) { auto inner = a.alloc(HEAP_ALIGN, /*scope_depth=*/1); a.release(inner.slot); @@ -331,7 +331,7 @@ TEST(Ring, InnerRingReclaimsWhileOuterHolds) { // Outer ring unchanged (one live slab at offset 0). EXPECT_EQ(a.heap_top(0), HEAP_ALIGN); EXPECT_EQ(a.heap_tail(0), 0u); - // Inner ring reclaimed each slab — tail caught up to top. + // Inner ring reclaimed each slab -- tail caught up to top. EXPECT_EQ(a.heap_tail(1), a.heap_top(1)); a.release(outer.slot); diff --git a/tests/ut/cpp/test_scheduler.cpp b/tests/ut/cpp/hierarchical/test_scheduler.cpp similarity index 98% rename from tests/ut/cpp/test_scheduler.cpp rename to tests/ut/cpp/hierarchical/test_scheduler.cpp index f13dd240f..87c50a895 100644 --- a/tests/ut/cpp/test_scheduler.cpp +++ b/tests/ut/cpp/hierarchical/test_scheduler.cpp @@ -205,7 +205,7 @@ TEST_F(SchedulerFixture, DependentTaskDispatchedAfterProducerCompletes) { } // =========================================================================== -// Group task tests — fixture with 2 MockWorkers +// Group task tests -- fixture with 2 MockWorkers // =========================================================================== struct GroupSchedulerFixture : public ::testing::Test { @@ -405,7 +405,7 @@ TEST_F(MixedTypeSchedulerFixture, SubTaskDispatchesWhileNextLevelPoolSaturated) EXPECT_TRUE(next_level_worker.is_running.load()) << "chip worker must still be busy"; // Complete the sub task first; it reaches CONSUMED while the chip task - // is still running — demonstrating independent per-type dispatch. + // is still running -- demonstrating independent per-type dispatch. sub_worker.complete(); wait_consumed(sub.task_slot); EXPECT_FALSE(is_consumed(chip.task_slot)); @@ -416,7 +416,7 @@ TEST_F(MixedTypeSchedulerFixture, SubTaskDispatchesWhileNextLevelPoolSaturated) TEST_F(GroupSchedulerFixture, GroupDependencyChain) { // Group A (2 workers) produces an OUTPUT at key 0xCAFE. - // Task B reads INPUT at the same key — depends on group A. 
+ // Task B reads INPUT at the same key -- depends on group A. TaskArgs a0 = single_tensor_args(0xCAFE, TensorArgType::OUTPUT); TaskArgs a1 = single_tensor_args(0xCAFE, TensorArgType::OUTPUT); auto a = orch.submit_next_level_group(0xDEAD, {a0, a1}, cfg); diff --git a/tests/ut/cpp/test_scope.cpp b/tests/ut/cpp/hierarchical/test_scope.cpp similarity index 97% rename from tests/ut/cpp/test_scope.cpp rename to tests/ut/cpp/hierarchical/test_scope.cpp index d8350d1c6..273b33bfc 100644 --- a/tests/ut/cpp/test_scope.cpp +++ b/tests/ut/cpp/hierarchical/test_scope.cpp @@ -43,7 +43,7 @@ TEST(Scope, SingleScope_ReleasesRegisteredTasks) { TEST(Scope, RegisterOutsideScopeIsNoop) { Scope sc; - sc.register_task(5); // no open scope — should not throw + sc.register_task(5); // no open scope -- should not throw EXPECT_EQ(sc.depth(), 0); } diff --git a/tests/ut/cpp/test_tensormap.cpp b/tests/ut/cpp/hierarchical/test_tensormap.cpp similarity index 100% rename from tests/ut/cpp/test_tensormap.cpp rename to tests/ut/cpp/hierarchical/test_tensormap.cpp diff --git a/tests/ut/cpp/hierarchical/test_worker_manager.cpp b/tests/ut/cpp/hierarchical/test_worker_manager.cpp new file mode 100644 index 000000000..54985eed4 --- /dev/null +++ b/tests/ut/cpp/hierarchical/test_worker_manager.cpp @@ -0,0 +1,423 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * WorkerManager / WorkerThread independent UT. + * + * Tests the worker-pool lifecycle, idle selection, dispatch, and group dispatch + * in THREAD mode using a lightweight MockWorker. PROCESS mode requires fork + + * shared-memory children and is covered by the Python integration tests. + * + * Follows UT development guidelines: + * - AAA pattern (Arrange / Act / Assert) + * - FIRST (Fast / Independent / Repeatable / Self-validating / Timely) + * - Single responsibility per test + * - Naming: Method_Scenario_ExpectedResult + * - Mock/Stub for external deps (IWorker) + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "ring.h" +#include "types.h" +#include "worker.h" +#include "worker_manager.h" + +namespace { + +// --------------------------------------------------------------------------- +// CountingWorker: records run() calls, optionally blocks until released. 
+// --------------------------------------------------------------------------- + +struct CountingWorker : public IWorker { + std::atomic<int> run_count{0}; + std::atomic<bool> is_running{false}; + + std::mutex mu; + std::condition_variable cv; + bool should_complete{false}; + bool blocking{false}; + + explicit CountingWorker(bool blocking_ = false) : + blocking(blocking_) {} + + void run(uint64_t /*callable*/, TaskArgsView /*args*/, const ChipCallConfig & /*config*/) override { + run_count.fetch_add(1, std::memory_order_relaxed); + if (blocking) { + is_running.store(true, std::memory_order_release); + std::unique_lock<std::mutex> lk(mu); + cv.wait(lk, [this] { + return should_complete; + }); + should_complete = false; + is_running.store(false, std::memory_order_release); + } + } + + void complete() { + std::lock_guard<std::mutex> lk(mu); + should_complete = true; + cv.notify_one(); + } + + void wait_running(int timeout_ms = 500) { + auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(timeout_ms); + while (!is_running.load(std::memory_order_acquire) && std::chrono::steady_clock::now() < deadline) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + } +}; + +// --------------------------------------------------------------------------- +// Fixture: creates a Ring and provides start/stop helpers.
+// --------------------------------------------------------------------------- + +class WorkerManagerTest : public ::testing::Test { +protected: + Ring ring_; + WorkerManager manager_; + std::vector<TaskSlot> completed_slots_; + std::mutex completed_mu_; + + void SetUp() override { ring_.init(/*heap_bytes=*/1ULL << 16); } + + void TearDown() override { + manager_.stop(); + ring_.shutdown(); + } + + void start_manager() { + manager_.start(&ring_, [this](TaskSlot slot) { + std::lock_guard<std::mutex> lk(completed_mu_); + completed_slots_.push_back(slot); + }); + } + + void wait_completed(int expected, int timeout_ms = 500) { + auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(timeout_ms); + while (std::chrono::steady_clock::now() < deadline) { + { + std::lock_guard<std::mutex> lk(completed_mu_); + if (static_cast<int>(completed_slots_.size()) >= expected) return; + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + } +}; + +} // namespace + +// ============================================================================= +// WorkerManager registration +// ============================================================================= + +TEST_F(WorkerManagerTest, AddNextLevel_BeforeStart_PickIdleReturnsWorkerAfterStart) { + // Arrange + CountingWorker w; + manager_.add_next_level(&w); + + // Act + start_manager(); + + // Assert + WorkerThread *idle = manager_.pick_idle(WorkerType::NEXT_LEVEL); + ASSERT_NE(idle, nullptr); + EXPECT_TRUE(idle->idle()); +} + +TEST_F(WorkerManagerTest, AddSub_BeforeStart_PickIdleReturnsSubWorker) { + CountingWorker w; + manager_.add_sub(&w); + start_manager(); + + EXPECT_NE(manager_.pick_idle(WorkerType::SUB), nullptr); + EXPECT_EQ(manager_.pick_idle(WorkerType::NEXT_LEVEL), nullptr); +} + +TEST_F(WorkerManagerTest, NoWorkers_PickIdleReturnsNull) { + start_manager(); + EXPECT_EQ(manager_.pick_idle(WorkerType::NEXT_LEVEL), nullptr); + EXPECT_EQ(manager_.pick_idle(WorkerType::SUB), nullptr); +} + +//
============================================================================= +// WorkerManager::start -- null ring +// ============================================================================= + +TEST_F(WorkerManagerTest, Start_NullRing_Throws) { + CountingWorker w; + manager_.add_next_level(&w); + EXPECT_THROW(manager_.start(nullptr, [](TaskSlot) {}), std::invalid_argument); +} + +// ============================================================================= +// Dispatch (THREAD mode) +// ============================================================================= + +TEST_F(WorkerManagerTest, Dispatch_SingleTask_WorkerRunsAndCompletes) { + // Arrange: one blocking worker + CountingWorker w(/*blocking=*/true); + manager_.add_next_level(&w); + start_manager(); + + // Allocate a slot in the ring so there is a valid TaskSlotState. + AllocResult ar = ring_.alloc(0); + TaskSlot slot = ar.slot; + ASSERT_NE(slot, INVALID_SLOT); + TaskSlotState &s = *ring_.slot_state(slot); + s.worker_type = WorkerType::NEXT_LEVEL; + s.callable = 0xABC; + + // Act: dispatch the task + WorkerThread *wt = manager_.pick_idle(WorkerType::NEXT_LEVEL); + ASSERT_NE(wt, nullptr); + wt->dispatch({slot, 0}); + + // Assert: worker is running + w.wait_running(); + EXPECT_TRUE(w.is_running.load()); + EXPECT_FALSE(wt->idle()); + + // Release the worker + w.complete(); + wait_completed(1); + + // Worker is idle again + auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(200); + while (!wt->idle() && std::chrono::steady_clock::now() < deadline) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + EXPECT_TRUE(wt->idle()); + EXPECT_EQ(w.run_count.load(), 1); +} + +TEST_F(WorkerManagerTest, Dispatch_MultipleWorkers_BothExecute) { + // Arrange: two blocking workers + CountingWorker w1(true), w2(true); + manager_.add_next_level(&w1); + manager_.add_next_level(&w2); + start_manager(); + + // Two slots + AllocResult ar1 = ring_.alloc(0); + AllocResult ar2 = 
ring_.alloc(0); + TaskSlot s1 = ar1.slot; + TaskSlot s2 = ar2.slot; + ring_.slot_state(s1)->worker_type = WorkerType::NEXT_LEVEL; + ring_.slot_state(s1)->callable = 1; + ring_.slot_state(s2)->worker_type = WorkerType::NEXT_LEVEL; + ring_.slot_state(s2)->callable = 2; + + // Dispatch to both + auto *t1 = manager_.get_worker(WorkerType::NEXT_LEVEL, 0); + auto *t2 = manager_.get_worker(WorkerType::NEXT_LEVEL, 1); + ASSERT_NE(t1, nullptr); + ASSERT_NE(t2, nullptr); + t1->dispatch({s1, 0}); + t2->dispatch({s2, 0}); + + w1.wait_running(); + w2.wait_running(); + + // Both are running concurrently + EXPECT_TRUE(w1.is_running.load()); + EXPECT_TRUE(w2.is_running.load()); + + w1.complete(); + w2.complete(); + wait_completed(2); +} + +// ============================================================================= +// pick_idle / pick_n_idle / pick_idle_excluding +// ============================================================================= + +TEST_F(WorkerManagerTest, PickNIdle_ReturnsUpToN) { + CountingWorker w1, w2, w3; + manager_.add_next_level(&w1); + manager_.add_next_level(&w2); + manager_.add_next_level(&w3); + start_manager(); + + auto idle2 = manager_.pick_n_idle(WorkerType::NEXT_LEVEL, 2); + EXPECT_EQ(static_cast<int>(idle2.size()), 2); + + auto idle10 = manager_.pick_n_idle(WorkerType::NEXT_LEVEL, 10); + EXPECT_EQ(static_cast<int>(idle10.size()), 3); +} + +TEST_F(WorkerManagerTest, PickIdleExcluding_SkipsExcludedWorkers) { + CountingWorker w1, w2; + manager_.add_next_level(&w1); + manager_.add_next_level(&w2); + start_manager(); + + auto *t0 = manager_.get_worker(WorkerType::NEXT_LEVEL, 0); + auto *t1 = manager_.get_worker(WorkerType::NEXT_LEVEL, 1); + + // Exclude t0 -> should get t1 + auto *picked = manager_.pick_idle_excluding(WorkerType::NEXT_LEVEL, {t0}); + EXPECT_EQ(picked, t1); + + // Exclude both -> nullptr + auto *none = manager_.pick_idle_excluding(WorkerType::NEXT_LEVEL, {t0, t1}); + EXPECT_EQ(none, nullptr); +} + +//
============================================================================= +// get_worker -- index bounds +// ============================================================================= + +TEST_F(WorkerManagerTest, GetWorker_ValidIndex_ReturnsThread) { + CountingWorker w; + manager_.add_next_level(&w); + start_manager(); + + EXPECT_NE(manager_.get_worker(WorkerType::NEXT_LEVEL, 0), nullptr); +} + +TEST_F(WorkerManagerTest, GetWorker_NegativeIndex_ReturnsNull) { + CountingWorker w; + manager_.add_next_level(&w); + start_manager(); + + EXPECT_EQ(manager_.get_worker(WorkerType::NEXT_LEVEL, -1), nullptr); +} + +TEST_F(WorkerManagerTest, GetWorker_OutOfBoundsIndex_ReturnsNull) { + CountingWorker w; + manager_.add_next_level(&w); + start_manager(); + + EXPECT_EQ(manager_.get_worker(WorkerType::NEXT_LEVEL, 99), nullptr); +} + +// ============================================================================= +// any_busy +// ============================================================================= + +TEST_F(WorkerManagerTest, AnyBusy_AllIdle_ReturnsFalse) { + CountingWorker w; + manager_.add_next_level(&w); + start_manager(); + + EXPECT_FALSE(manager_.any_busy()); +} + +TEST_F(WorkerManagerTest, AnyBusy_OneRunning_ReturnsTrue) { + CountingWorker w(/*blocking=*/true); + manager_.add_next_level(&w); + start_manager(); + + AllocResult ar = ring_.alloc(0); + TaskSlot slot = ar.slot; + ring_.slot_state(slot)->worker_type = WorkerType::NEXT_LEVEL; + + manager_.pick_idle(WorkerType::NEXT_LEVEL)->dispatch({slot, 0}); + w.wait_running(); + + EXPECT_TRUE(manager_.any_busy()); + + w.complete(); + wait_completed(1); +} + +// ============================================================================= +// Worker lifecycle (top-level Worker class) +// ============================================================================= + +TEST(WorkerLifecycleTest, Construct_InitClose_NoCrash) { + // Arrange + CountingWorker mock; + + // Act: full lifecycle without dispatching anything 
+ Worker worker(/*level=*/3, /*heap_ring_size=*/1ULL << 16); + worker.add_worker(WorkerType::NEXT_LEVEL, &mock); + worker.init(); + worker.close(); + + // Assert: reaching here without crash/hang is the observable signal. + SUCCEED(); +} + +TEST(WorkerLifecycleTest, AddWorker_AfterInit_Throws) { + CountingWorker mock; + Worker worker(3, 1ULL << 16); + worker.add_worker(WorkerType::NEXT_LEVEL, &mock); + worker.init(); + + CountingWorker extra; + EXPECT_THROW(worker.add_worker(WorkerType::NEXT_LEVEL, &extra), std::runtime_error); + + worker.close(); +} + +TEST(WorkerLifecycleTest, DoubleInit_Throws) { + CountingWorker mock; + Worker worker(3, 1ULL << 16); + worker.add_worker(WorkerType::NEXT_LEVEL, &mock); + worker.init(); + + EXPECT_THROW(worker.init(), std::runtime_error); + + worker.close(); +} + +TEST(WorkerLifecycleTest, CloseWithoutInit_Noop) { + Worker worker(3, 1ULL << 16); + worker.close(); + SUCCEED(); +} + +TEST(WorkerLifecycleTest, DestructorCallsClose) { + CountingWorker mock; + { + Worker worker(3, 1ULL << 16); + worker.add_worker(WorkerType::NEXT_LEVEL, &mock); + worker.init(); + // destructor should call close() without hang/crash + } + SUCCEED(); +} + +TEST(WorkerLifecycleTest, RunCallback_InvokedWhenSet) { + Worker worker(3, 1ULL << 16); + + int callback_count = 0; + worker.set_run_callback([&](uint64_t, TaskArgsView, const ChipCallConfig &) { + callback_count++; + }); + + // Worker::run delegates to run_callback_ + ChipCallConfig cfg{}; + TaskArgsView view{}; + worker.run(42, view, cfg); + + EXPECT_EQ(callback_count, 1); +} + +TEST(WorkerLifecycleTest, RunWithoutCallback_Noop) { + Worker worker(3, 1ULL << 16); + + ChipCallConfig cfg{}; + TaskArgsView view{}; + worker.run(42, view, cfg); + // No crash when run_callback_ is empty. 
+ SUCCEED(); +} diff --git a/tests/ut/cpp/platform/test_platform_host_log.cpp b/tests/ut/cpp/platform/test_platform_host_log.cpp new file mode 100644 index 000000000..3934f2b69 --- /dev/null +++ b/tests/ut/cpp/platform/test_platform_host_log.cpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * UT for the host-side logging singleton (HostLogger). + * + * Covers the parts that are testable in pure userspace: + * - level filtering (is_enabled) + * - PTO_LOG_LEVEL env-var parsing via reinitialize() + * + * Hardware-specific sinks (e.g. device-side ring buffers) are out of scope. + */ + +#include <gtest/gtest.h> + +#include <cstdlib> + +#include "host_log.h" + +namespace { + +// Test fixture: save/restore PTO_LOG_LEVEL so tests are independent regardless +// of execution order or shell environment.
+class HostLoggerTest : public ::testing::Test { +protected: + std::string saved_level_; + bool had_level_ = false; + + void SetUp() override { + const char *env = std::getenv("PTO_LOG_LEVEL"); + had_level_ = env != nullptr; + if (had_level_) saved_level_ = env; + unsetenv("PTO_LOG_LEVEL"); + } + + void TearDown() override { + if (had_level_) { + setenv("PTO_LOG_LEVEL", saved_level_.c_str(), 1); + } else { + unsetenv("PTO_LOG_LEVEL"); + } + HostLogger::get_instance().reinitialize(); // restore logger to real env + } +}; + +} // namespace + +// ---------- is_enabled at default level ---------- + +TEST_F(HostLoggerTest, DefaultLevel_InfoEnabled_DebugDisabled) { + // Arrange: no env var -> default INFO. + HostLogger::get_instance().reinitialize(); + + // Assert + HostLogger &logger = HostLogger::get_instance(); + EXPECT_TRUE(logger.is_enabled(HostLogLevel::ERROR)); + EXPECT_TRUE(logger.is_enabled(HostLogLevel::WARN)); + EXPECT_TRUE(logger.is_enabled(HostLogLevel::INFO)); + EXPECT_FALSE(logger.is_enabled(HostLogLevel::DEBUG)); +} + +TEST_F(HostLoggerTest, AlwaysLevel_IsAlwaysEnabled) { + HostLogger::get_instance().reinitialize(); + // ALWAYS == -1, so it should pass the <= check for any current_level_ >= 0. 
+ EXPECT_TRUE(HostLogger::get_instance().is_enabled(HostLogLevel::ALWAYS)); +} + +// ---------- env-var parsing ---------- + +TEST_F(HostLoggerTest, EnvLevelError_SilencesInfoAndWarn) { + setenv("PTO_LOG_LEVEL", "error", 1); + HostLogger::get_instance().reinitialize(); + + HostLogger &logger = HostLogger::get_instance(); + EXPECT_TRUE(logger.is_enabled(HostLogLevel::ERROR)); + EXPECT_FALSE(logger.is_enabled(HostLogLevel::WARN)); + EXPECT_FALSE(logger.is_enabled(HostLogLevel::INFO)); + EXPECT_FALSE(logger.is_enabled(HostLogLevel::DEBUG)); +} + +TEST_F(HostLoggerTest, EnvLevelDebug_EnablesAllNumericLevels) { + setenv("PTO_LOG_LEVEL", "debug", 1); + HostLogger::get_instance().reinitialize(); + + HostLogger &logger = HostLogger::get_instance(); + EXPECT_TRUE(logger.is_enabled(HostLogLevel::ERROR)); + EXPECT_TRUE(logger.is_enabled(HostLogLevel::WARN)); + EXPECT_TRUE(logger.is_enabled(HostLogLevel::INFO)); + EXPECT_TRUE(logger.is_enabled(HostLogLevel::DEBUG)); +} + +TEST_F(HostLoggerTest, EnvLevelMixedCase_IsCaseInsensitive) { + setenv("PTO_LOG_LEVEL", "WaRn", 1); + HostLogger::get_instance().reinitialize(); + + HostLogger &logger = HostLogger::get_instance(); + EXPECT_TRUE(logger.is_enabled(HostLogLevel::WARN)); + EXPECT_FALSE(logger.is_enabled(HostLogLevel::INFO)); +} + +TEST_F(HostLoggerTest, EnvLevelUnknown_FallsBackToInfo) { + setenv("PTO_LOG_LEVEL", "not_a_real_level", 1); + HostLogger::get_instance().reinitialize(); + + HostLogger &logger = HostLogger::get_instance(); + EXPECT_TRUE(logger.is_enabled(HostLogLevel::INFO)); + EXPECT_FALSE(logger.is_enabled(HostLogLevel::DEBUG)); +} + +// ---------- Smoke: log(...) does not crash ---------- + +TEST_F(HostLoggerTest, LogAboveLevel_DoesNotCrash) { + setenv("PTO_LOG_LEVEL", "info", 1); + HostLogger::get_instance().reinitialize(); + + // Act: exercise the log formatter path. Output goes to stdout/stderr; the + // observable property we assert is that no exception is thrown and the + // process does not abort. 
+ HostLogger::get_instance().log(HostLogLevel::INFO, "unit-test info %d", 42); + HostLogger::get_instance().log(HostLogLevel::DEBUG, "this should be filtered"); + SUCCEED(); +} diff --git a/tests/ut/cpp/platform/test_platform_memory_allocator.cpp b/tests/ut/cpp/platform/test_platform_memory_allocator.cpp new file mode 100644 index 000000000..d88221ef4 --- /dev/null +++ b/tests/ut/cpp/platform/test_platform_memory_allocator.cpp @@ -0,0 +1,143 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * UT for the sim variant of src/a2a3/platform/sim/host/memory_allocator.cpp. + * + * The sim implementation wraps malloc/free and tracks live pointers in a + * std::set, making it directly unit-testable without any hardware or CANN + * runtime dependency. Hardware (onboard) variants that call rtMalloc/rtFree + * are intentionally NOT covered here -- they need an Ascend device and are + * exercised by the hardware CI job. 
+ */
+
+#include <gtest/gtest.h>
+
+#include <cstddef>
+#include <cstring>
+
+#include "host/memory_allocator.h"
+
+namespace {
+
+class MemoryAllocatorTest : public ::testing::Test {
+protected:
+    MemoryAllocator alloc_;
+};
+
+} // namespace
+
+// ---------- Happy path ----------
+
+TEST_F(MemoryAllocatorTest, Alloc_ValidSize_ReturnsUsablePointer) {
+    // Act
+    void *p = alloc_.alloc(128);
+
+    // Assert
+    ASSERT_NE(p, nullptr);
+    EXPECT_EQ(alloc_.get_allocation_count(), 1U);
+
+    // Use memory to ensure it is real, writable storage.
+    std::memset(p, 0xAB, 128);
+
+    // Cleanup
+    EXPECT_EQ(alloc_.free(p), 0);
+    EXPECT_EQ(alloc_.get_allocation_count(), 0U);
+}
+
+TEST_F(MemoryAllocatorTest, Alloc_MultipleTracksAll) {
+    void *p1 = alloc_.alloc(16);
+    void *p2 = alloc_.alloc(32);
+    void *p3 = alloc_.alloc(64);
+    ASSERT_NE(p1, nullptr);
+    ASSERT_NE(p2, nullptr);
+    ASSERT_NE(p3, nullptr);
+
+    EXPECT_EQ(alloc_.get_allocation_count(), 3U);
+
+    alloc_.free(p1);
+    alloc_.free(p2);
+    alloc_.free(p3);
+    EXPECT_EQ(alloc_.get_allocation_count(), 0U);
+}
+
+// ---------- Free: edge cases ----------
+
+TEST_F(MemoryAllocatorTest, Free_Nullptr_ReturnsZeroNoop) {
+    // Contract: free(nullptr) is a safe no-op.
+    EXPECT_EQ(alloc_.free(nullptr), 0);
+    EXPECT_EQ(alloc_.get_allocation_count(), 0U);
+}
+
+TEST_F(MemoryAllocatorTest, Free_UntrackedPointer_ReturnsZeroLeavesStateAlone) {
+    // Allocate something to populate the set.
+    void *tracked = alloc_.alloc(8);
+    ASSERT_NE(tracked, nullptr);
+
+    // An unrelated, untracked address must NOT be freed -- memory_allocator
+    // only calls std::free() for pointers it allocated itself.
+ int stack_int = 0; + EXPECT_EQ(alloc_.free(&stack_int), 0); + EXPECT_EQ(alloc_.get_allocation_count(), 1U); + + alloc_.free(tracked); +} + +TEST_F(MemoryAllocatorTest, Free_SamePointerTwice_SecondCallIsNoop) { + void *p = alloc_.alloc(8); + ASSERT_NE(p, nullptr); + + EXPECT_EQ(alloc_.free(p), 0); + EXPECT_EQ(alloc_.get_allocation_count(), 0U); + + // Second free on the same (now untracked) address: no crash, no state change. + EXPECT_EQ(alloc_.free(p), 0); + EXPECT_EQ(alloc_.get_allocation_count(), 0U); +} + +// ---------- finalize ---------- + +TEST_F(MemoryAllocatorTest, Finalize_FreesAllTrackedAllocations) { + // Arrange + (void)alloc_.alloc(16); + (void)alloc_.alloc(32); + ASSERT_EQ(alloc_.get_allocation_count(), 2U); + + // Act + EXPECT_EQ(alloc_.finalize(), 0); + + // Assert + EXPECT_EQ(alloc_.get_allocation_count(), 0U); +} + +TEST_F(MemoryAllocatorTest, Finalize_IdempotentWhenEmpty) { + EXPECT_EQ(alloc_.finalize(), 0); + EXPECT_EQ(alloc_.finalize(), 0); + EXPECT_EQ(alloc_.get_allocation_count(), 0U); +} + +TEST_F(MemoryAllocatorTest, Destructor_CallsFinalizeAutomatically) { + // Use a local-scope allocator to trigger RAII cleanup. + size_t count_after_destruct = 0; + { + MemoryAllocator scoped; + (void)scoped.alloc(16); + (void)scoped.alloc(32); + ASSERT_EQ(scoped.get_allocation_count(), 2U); + // scoped goes out of scope here -- destructor must free the 2 allocations. + } + // We can't query the destroyed allocator, but reaching here without leak + // reports under asan/ubsan is the observable signal. A fresh allocator + // starts at zero -- confirm basic post-condition. 
+    MemoryAllocator fresh;
+    count_after_destruct = fresh.get_allocation_count();
+    EXPECT_EQ(count_after_destruct, 0U);
+}
diff --git a/tests/ut/cpp/test_a2a3_pto2_fatal.cpp b/tests/ut/cpp/pto2_a2a3/test_a2a3_pto2_fatal.cpp
similarity index 90%
rename from tests/ut/cpp/test_a2a3_pto2_fatal.cpp
rename to tests/ut/cpp/pto2_a2a3/test_a2a3_pto2_fatal.cpp
index b4e2c8e00..1ea2aa042 100644
--- a/tests/ut/cpp/test_a2a3_pto2_fatal.cpp
+++ b/tests/ut/cpp/pto2_a2a3/test_a2a3_pto2_fatal.cpp
@@ -41,6 +41,8 @@ struct FakeRuntime {
     std::string last_fatal_message;
 };
 
+static_assert(offsetof(FakeRuntime, ops) == 0); // Guard: reinterpret_cast below assumes ops is first member.
+
 FakeRuntime *as_fake(PTO2Runtime *rt) { return reinterpret_cast<FakeRuntime *>(rt); }
 
 TaskOutputTensors fake_submit(PTO2Runtime *rt, const MixedKernels &, const Arg &) {
@@ -87,20 +89,20 @@ TaskOutputTensors fake_alloc_tensors(PTO2Runtime *rt, const Arg &) {
 }
 
 const PTO2RuntimeOps kFakeOps = {
-    fake_submit,
-    fake_scope_begin,
-    fake_scope_end,
-    fake_orchestration_done,
-    fake_is_fatal,
-    fake_report_fatal,
-    fake_log,
-    fake_log,
-    fake_log,
-    fake_log,
-    fake_log,
-    fake_get_tensor_data,
-    fake_set_tensor_data,
-    fake_alloc_tensors,
+    .submit_task = fake_submit,
+    .scope_begin = fake_scope_begin,
+    .scope_end = fake_scope_end,
+    .orchestration_done = fake_orchestration_done,
+    .is_fatal = fake_is_fatal,
+    .report_fatal = fake_report_fatal,
+    .log_error = fake_log,
+    .log_warn = fake_log,
+    .log_info = fake_log,
+    .log_debug = fake_log,
+    .log_always = fake_log,
+    .get_tensor_data = fake_get_tensor_data,
+    .set_tensor_data = fake_set_tensor_data,
+    .alloc_tensors = fake_alloc_tensors,
 };
 
 class RuntimeBindingGuard {
diff --git a/tests/ut/cpp/pto2_a2a3/test_boundary_edge.cpp b/tests/ut/cpp/pto2_a2a3/test_boundary_edge.cpp
new file mode 100644
index 000000000..b17ff85ed
--- /dev/null
+++ b/tests/ut/cpp/pto2_a2a3/test_boundary_edge.cpp
@@ -0,0 +1,693 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Supplemental boundary-condition tests for:
+ *   1. ReadyQueue high-contention stress (8+ threads, exactly-once guarantee)
+ *   2. TaskAllocator double-destroy / re-init safety
+ *   3. Scheduler sequence counter near INT64 wrap
+ *   4. SharedMemory concurrent read/write of per-ring flow control
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <cstring>
+#include <thread>
+#include <vector>
+
+#include "pto_ring_buffer.h"
+#include "pto_scheduler.h"
+#include "pto_shared_memory.h"
+#include "../test_helpers.h"
+
+// =============================================================================
+// 1. 
ReadyQueue high-contention stress
+// =============================================================================
+
+class ReadyQueueStressTest : public ::testing::Test {
+protected:
+    static constexpr uint64_t kCapacity = 512;
+    PTO2ReadyQueue queue;
+
+    void SetUp() override { ASSERT_TRUE(pto2_ready_queue_init(&queue, kCapacity)); }
+
+    void TearDown() override { pto2_ready_queue_destroy(&queue); }
+};
+
+// 8 producers / 8 consumers, high volume -- every item consumed exactly once
+TEST_F(ReadyQueueStressTest, EightProducersEightConsumers) {
+    constexpr int kItemsPerProducer = 2000;
+    constexpr int kProducers = 8;
+    constexpr int kConsumers = 8;
+    constexpr int kTotalItems = kItemsPerProducer * kProducers;
+
+    std::vector<PTO2TaskSlotState> items(kTotalItems);
+    for (int i = 0; i < kTotalItems; i++) {
+        items[i].fanin_count = i;
+    }
+
+    std::vector<std::atomic<int>> consumed_count(kTotalItems);
+    for (auto &c : consumed_count)
+        c.store(0, std::memory_order_relaxed);
+
+    std::atomic<int> producers_done{0};
+
+    auto producer = [&](int id) {
+        int base = id * kItemsPerProducer;
+        for (int i = 0; i < kItemsPerProducer; i++) {
+            while (!queue.push(&items[base + i])) {}
+        }
+        producers_done.fetch_add(1, std::memory_order_release);
+    };
+
+    auto consumer = [&](std::atomic<int> &local_count) {
+        while (true) {
+            PTO2TaskSlotState *item = queue.pop();
+            if (item) {
+                consumed_count[item->fanin_count].fetch_add(1, std::memory_order_relaxed);
+                local_count.fetch_add(1, std::memory_order_relaxed);
+            } else if (producers_done.load(std::memory_order_acquire) == kProducers) {
+                // Final drain
+                while ((item = queue.pop()) != nullptr) {
+                    consumed_count[item->fanin_count].fetch_add(1, std::memory_order_relaxed);
+                    local_count.fetch_add(1, std::memory_order_relaxed);
+                }
+                break;
+            }
+        }
+    };
+
+    std::vector<std::atomic<int>> per_consumer_count(kConsumers);
+    for (auto &c : per_consumer_count)
+        c.store(0);
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i < kProducers; i++) {
+        threads.emplace_back(producer, i);
+    }
+    for (int i = 0; i < kConsumers; i++) {
+        threads.emplace_back(consumer, std::ref(per_consumer_count[i]));
+    }
+    for (auto &t : threads)
+        t.join();
+
+    // Every item consumed exactly once
+    int total = 0;
+    for (int i = 0; i < kTotalItems; i++) {
+        EXPECT_EQ(consumed_count[i].load(), 1) << "Item " << i << " consumed " << consumed_count[i].load() << " times";
+        total += consumed_count[i].load();
+    }
+    EXPECT_EQ(total, kTotalItems);
+
+    // Work is distributed across consumers (not all consumed by one)
+    int active_consumers = 0;
+    for (int i = 0; i < kConsumers; i++) {
+        if (per_consumer_count[i].load() > 0) active_consumers++;
+    }
+    EXPECT_GT(active_consumers, 1) << "Work should be distributed across multiple consumers";
+}
+
+// Rapid fill-drain cycles under contention
+TEST_F(ReadyQueueStressTest, RapidFillDrainCycles) {
+    constexpr int kCycles = 100;
+    constexpr int kItemsPerCycle = static_cast<int>(kCapacity / 2);
+
+    std::vector<PTO2TaskSlotState> items(kItemsPerCycle);
+    for (int i = 0; i < kItemsPerCycle; i++) {
+        items[i].fanin_count = i;
+    }
+
+    for (int cycle = 0; cycle < kCycles; cycle++) {
+        std::atomic<int> push_done{0};
+        std::atomic<int> popped{0};
+
+        // 4 producers push in parallel
+        auto producer = [&](int id) {
+            int per_thread = kItemsPerCycle / 4;
+            int base = id * per_thread;
+            for (int i = 0; i < per_thread; i++) {
+                while (!queue.push(&items[base + i])) {}
+            }
+            push_done.fetch_add(1, std::memory_order_release);
+        };
+
+        // 4 consumers drain in parallel
+        auto consumer = [&]() {
+            while (true) {
+                PTO2TaskSlotState *s = queue.pop();
+                if (s) {
+                    popped.fetch_add(1, std::memory_order_relaxed);
+                } else if (push_done.load(std::memory_order_acquire) == 4) {
+                    while ((s = queue.pop()) != nullptr) {
+                        popped.fetch_add(1, std::memory_order_relaxed);
+                    }
+                    break;
+                }
+            }
+        };
+
+        std::vector<std::thread> threads;
+        for (int i = 0; i < 4; i++)
+            threads.emplace_back(producer, i);
+        for (int i = 0; i < 4; i++)
+            threads.emplace_back(consumer);
+        for (auto &t : threads)
+            t.join();
+
+        ASSERT_EQ(popped.load(), kItemsPerCycle) << "Cycle " << cycle << ": lost items";
+    }
+}
+
+// push_batch + pop_batch under contention
+TEST_F(ReadyQueueStressTest, BatchPushPopContention) {
+    constexpr int kBatchSize = 8;
+    constexpr int kBatches = 500;
+    constexpr int kProducers = 4;
+    constexpr int kTotalItems = kBatchSize * kBatches * kProducers;
+
+    std::vector<PTO2TaskSlotState> items(kTotalItems);
+    for (int i = 0; i < kTotalItems; i++)
+        items[i].fanin_count = i;
+
+    std::atomic<int> total_consumed{0};
+    std::atomic<int> producers_done{0};
+
+    auto producer = [&](int id) {
+        int base = id * kBatchSize * kBatches;
+        for (int b = 0; b < kBatches; b++) {
+            PTO2TaskSlotState *ptrs[kBatchSize];
+            for (int i = 0; i < kBatchSize; i++) {
+                ptrs[i] = &items[base + b * kBatchSize + i];
+            }
+            // Push one at a time with retry: push_batch may partially fail
+            // when the queue is near full.
+            for (int i = 0; i < kBatchSize; i++) {
+                while (!queue.push(ptrs[i])) {}
+            }
+        }
+        producers_done.fetch_add(1, std::memory_order_release);
+    };
+
+    auto consumer = [&]() {
+        while (true) {
+            PTO2TaskSlotState *out[kBatchSize];
+            int n = queue.pop_batch(out, kBatchSize);
+            total_consumed.fetch_add(n, std::memory_order_relaxed);
+            if (n == 0 && producers_done.load(std::memory_order_acquire) == kProducers) {
+                // Final drain
+                while (true) {
+                    n = queue.pop_batch(out, kBatchSize);
+                    if (n == 0) break;
+                    total_consumed.fetch_add(n, std::memory_order_relaxed);
+                }
+                break;
+            }
+        }
+    };
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i < kProducers; i++)
+        threads.emplace_back(producer, i);
+    for (int i = 0; i < 4; i++)
+        threads.emplace_back(consumer);
+    for (auto &t : threads)
+        t.join();
+
+    EXPECT_EQ(total_consumed.load(), kTotalItems);
+}
+
+// =============================================================================
+// 2. 
TaskAllocator double-destroy / re-init safety
+// =============================================================================
+
+class TaskAllocatorDoubleDestroyTest : public ::testing::Test {
+protected:
+    static constexpr int32_t WINDOW_SIZE = 16;
+    static constexpr uint64_t HEAP_SIZE = 1024;
+
+    std::vector<PTO2TaskDescriptor> descriptors;
+    alignas(64) uint8_t heap_buf[1024]{};
+    std::atomic<int32_t> current_index{0};
+    std::atomic<int32_t> last_alive{0};
+    std::atomic<int32_t> error_code{PTO2_ERROR_NONE};
+    PTO2TaskAllocator allocator{};
+
+    void InitAllocator() {
+        descriptors.assign(WINDOW_SIZE, PTO2TaskDescriptor{});
+        std::memset(heap_buf, 0, sizeof(heap_buf));
+        current_index.store(0);
+        last_alive.store(0);
+        error_code.store(PTO2_ERROR_NONE);
+        allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code);
+    }
+};
+
+// Re-init after use: allocator should work fresh
+TEST_F(TaskAllocatorDoubleDestroyTest, ReInitAfterUse) {
+    InitAllocator();
+
+    // Use the allocator
+    auto r1 = allocator.alloc(128);
+    ASSERT_FALSE(r1.failed());
+    auto r2 = allocator.alloc(128);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.task_id, 1);
+
+    // Re-init: should reset state
+    InitAllocator();
+
+    // Should start from task_id 0 again
+    auto r3 = allocator.alloc(64);
+    ASSERT_FALSE(r3.failed());
+    EXPECT_EQ(r3.task_id, 0) << "Re-init should reset task ID counter";
+    EXPECT_EQ(r3.slot, 0);
+}
+
+// Re-init with different heap size
+TEST_F(TaskAllocatorDoubleDestroyTest, ReInitDifferentHeapSize) {
+    InitAllocator();
+
+    auto r1 = allocator.alloc(HEAP_SIZE);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator.heap_top(), HEAP_SIZE);
+
+    // Re-init with same buffer but fresh state
+    InitAllocator();
+    EXPECT_EQ(allocator.heap_top(), 0u) << "Re-init resets heap_top";
+    EXPECT_EQ(allocator.heap_available(), HEAP_SIZE) << "Re-init restores full capacity";
+}
+
+// Re-init after error state: error flag should be clearable
+TEST_F(TaskAllocatorDoubleDestroyTest, ReInitClearsErrorState) {
InitAllocator(); + + // Force a deadlock error + auto r = allocator.alloc(HEAP_SIZE * 2); + EXPECT_TRUE(r.failed()); + EXPECT_NE(error_code.load(), PTO2_ERROR_NONE); + + // Re-init clears error + InitAllocator(); + EXPECT_EQ(error_code.load(), PTO2_ERROR_NONE); + + // Allocator should work again + auto r2 = allocator.alloc(64); + EXPECT_FALSE(r2.failed()); +} + +// Multiple re-init cycles: no resource leak or corruption +TEST_F(TaskAllocatorDoubleDestroyTest, MultipleReInitCycles) { + for (int cycle = 0; cycle < 10; cycle++) { + InitAllocator(); + + for (int i = 0; i < WINDOW_SIZE - 1; i++) { + auto r = allocator.alloc(0); + ASSERT_FALSE(r.failed()) << "Cycle " << cycle << " alloc " << i; + EXPECT_EQ(r.task_id, i); + } + } +} + +// Re-init with stale last_alive: allocator sees fresh state +TEST_F(TaskAllocatorDoubleDestroyTest, ReInitIgnoresStaleLastAlive) { + InitAllocator(); + + // Advance state + auto r1 = allocator.alloc(64); + ASSERT_FALSE(r1.failed()); + last_alive.store(5, std::memory_order_release); // Stale value + + // Re-init resets last_alive + InitAllocator(); + EXPECT_EQ(last_alive.load(), 0); + + auto r2 = allocator.alloc(64); + ASSERT_FALSE(r2.failed()); + EXPECT_EQ(r2.task_id, 0); +} + +// ============================================================================= +// 3. 
Scheduler sequence counter near INT64 wrap +// ============================================================================= + +class SequenceWrapTest : public ::testing::Test { +protected: + static constexpr uint64_t QUEUE_CAP = 8; + PTO2ReadyQueueSlot slots[8]{}; + PTO2ReadyQueue queue{}; + PTO2TaskSlotState dummy[8]{}; + + void InitQueueAtSequence(int64_t start_seq) { test_ready_queue_init(&queue, slots, QUEUE_CAP, start_seq); } +}; + +// Sequence near INT64_MAX: push/pop should still work +TEST_F(SequenceWrapTest, NearInt64Max) { + int64_t near_max = INT64_MAX - 16; + InitQueueAtSequence(near_max); + + // Push and pop several items, crossing INT64_MAX + for (int i = 0; i < 5; i++) { + ASSERT_TRUE(queue.push(&dummy[i])) << "Push " << i << " near INT64_MAX"; + } + + for (int i = 0; i < 5; i++) { + PTO2TaskSlotState *s = queue.pop(); + ASSERT_NE(s, nullptr) << "Pop " << i << " near INT64_MAX"; + EXPECT_EQ(s, &dummy[i]); + } + EXPECT_EQ(queue.pop(), nullptr); +} + +// Sequence near INT64_MAX: fill to capacity then drain +TEST_F(SequenceWrapTest, FillDrainNearMax) { + int64_t near_max = INT64_MAX - 4; + InitQueueAtSequence(near_max); + + int pushed = 0; + for (uint64_t i = 0; i < QUEUE_CAP; i++) { + if (queue.push(&dummy[i % 8])) pushed++; + else break; + } + EXPECT_GE(pushed, 1) << "Should push at least some items near max"; + + for (int i = 0; i < pushed; i++) { + EXPECT_NE(queue.pop(), nullptr); + } + EXPECT_EQ(queue.pop(), nullptr); +} + +// Sequence near INT64_MAX: interleaved push/pop crossing the boundary +TEST_F(SequenceWrapTest, InterleavedAcrossBoundary) { + int64_t near_max = INT64_MAX - 2; + InitQueueAtSequence(near_max); + + // Each push/pop advances sequence by 1; after 5 cycles we cross INT64_MAX + for (int i = 0; i < 10; i++) { + ASSERT_TRUE(queue.push(&dummy[0])) << "Push " << i << " at sequence ~" << (near_max + i); + PTO2TaskSlotState *s = queue.pop(); + ASSERT_NE(s, nullptr) << "Pop " << i; + EXPECT_EQ(s, &dummy[0]); + } +} + +// Sequence at 
exactly INT64_MAX: single push/pop
+TEST_F(SequenceWrapTest, ExactlyAtInt64Max) {
+    InitQueueAtSequence(INT64_MAX);
+
+    ASSERT_TRUE(queue.push(&dummy[0]));
+    PTO2TaskSlotState *s = queue.pop();
+    EXPECT_EQ(s, &dummy[0]);
+}
+
+// ---------------------------------------------------------------------------
+// BUG-CANDIDATE: pop() fast-path uses unsigned comparison `d >= e`.
+//
+// When enqueue_pos crosses INT64_MAX (as uint64_t), the arithmetic is still
+// valid for unsigned because uint64 wraps modularly. However, inside push()
+// and pop(), `static_cast<int64_t>(pos)` reinterprets bits: a pos of
+// 0x8000000000000000 becomes INT64_MIN. The sequence counters undergo the
+// same reinterpretation, so diff calculations remain consistent.
+//
+// The REAL concern is pop()'s fast-path: `if (d >= e) return nullptr`.
+// After enough operations, enqueue_pos wraps around UINT64_MAX back to a
+// small number while dequeue_pos is still large. At that point d > e
+// (unsigned), causing pop() to return nullptr even though items are queued.
+//
+// This test starts positions near UINT64_MAX to simulate the wrap scenario.
+// It documents that UINT64_MAX overflow in enqueue_pos/dequeue_pos would
+// break the fast-path, but this requires 2^64 operations -- practically
+// unreachable. We test the INT64 boundary (2^63) which IS reachable in
+// extremely long-running graphs.
+// ---------------------------------------------------------------------------
+TEST_F(SequenceWrapTest, PushBatchThenPopAcrossInt64Boundary) {
+    // Start at INT64_MAX - 2 so that after 3 pushes, enqueue_pos crosses
+    // into the INT64_MIN region (as signed), while dequeue_pos stays at
+    // INT64_MAX - 2.
+    int64_t start = INT64_MAX - 2;
+    InitQueueAtSequence(start);
+
+    // Push 5 items: pos goes INT64_MAX-2, INT64_MAX-1, INT64_MAX,
+    // INT64_MAX+1, INT64_MAX+2.
+    for (int i = 0; i < 5; i++) {
+        ASSERT_TRUE(queue.push(&dummy[i])) << "Push " << i << " failed (pos would be ~INT64_MAX+" << (i - 2) << ")";
+    }
+
+    // Pop all 5: dequeue_pos starts at INT64_MAX-2, catches up.
+    // The fast-path `d >= e` compares unsigned values; since both grow
+    // monotonically as uint64_t, this stays correct across the signed
+    // boundary.
+    for (int i = 0; i < 5; i++) {
+        PTO2TaskSlotState *popped = queue.pop();
+        ASSERT_NE(popped, nullptr) << "Pop " << i << " returned nullptr -- fast-path may have misjudged empty";
+        EXPECT_EQ(popped, &dummy[i]);
+    }
+    EXPECT_EQ(queue.pop(), nullptr);
+}
+
+// Concurrent push/pop near INT64_MAX boundary
+TEST_F(SequenceWrapTest, ConcurrentNearMax) {
+    static constexpr uint64_t BIG_CAP = 64;
+    PTO2ReadyQueueSlot big_slots[BIG_CAP];
+    PTO2ReadyQueue big_queue{};
+    int64_t start = INT64_MAX - 500;
+    test_ready_queue_init(&big_queue, big_slots, BIG_CAP, start);
+
+    constexpr int N = 1000;
+    std::vector<PTO2TaskSlotState> items(N);
+    for (int i = 0; i < N; i++)
+        items[i].fanin_count = i;
+
+    std::atomic<int> consumed{0};
+    std::atomic<bool> prod_done{false};
+
+    auto producer = [&]() {
+        for (int i = 0; i < N; i++) {
+            while (!big_queue.push(&items[i])) {}
+        }
+        prod_done.store(true, std::memory_order_release);
+    };
+
+    auto consumer = [&]() {
+        while (true) {
+            PTO2TaskSlotState *s = big_queue.pop();
+            if (s) {
+                consumed.fetch_add(1, std::memory_order_relaxed);
+            } else if (prod_done.load(std::memory_order_acquire)) {
+                while ((s = big_queue.pop()) != nullptr) {
+                    consumed.fetch_add(1, std::memory_order_relaxed);
+                }
+                break;
+            }
+        }
+    };
+
+    std::thread p(producer);
+    std::thread c1(consumer);
+    std::thread c2(consumer);
+    p.join();
+    c1.join();
+    c2.join();
+
+    EXPECT_EQ(consumed.load(), N);
+}
+
+// =============================================================================
+// 4. 
SharedMemory concurrent read/write of per-ring flow control
+// =============================================================================
+
+class SharedMemoryConcurrentTest : public ::testing::Test {
+protected:
+    PTO2SharedMemoryHandle *handle = nullptr;
+
+    void SetUp() override {
+        handle = pto2_sm_create(256, 4096);
+        ASSERT_NE(handle, nullptr);
+    }
+
+    void TearDown() override {
+        if (handle) {
+            pto2_sm_destroy(handle);
+            handle = nullptr;
+        }
+    }
+};
+
+// Concurrent current_task_index updates across different rings: no cross-ring interference
+TEST_F(SharedMemoryConcurrentTest, PerRingTaskIndexIsolation) {
+    constexpr int kIterations = 10000;
+
+    auto writer = [&](int ring) {
+        auto &fc = handle->header->rings[ring].fc;
+        for (int i = 1; i <= kIterations; i++) {
+            fc.current_task_index.store(static_cast<int32_t>(i), std::memory_order_release);
+        }
+    };
+
+    auto reader = [&](int ring, bool *saw_other_ring_value) {
+        auto &fc = handle->header->rings[ring].fc;
+        int32_t prev = 0;
+        for (int i = 0; i < kIterations; i++) {
+            int32_t val = fc.current_task_index.load(std::memory_order_acquire);
+            // Values should be monotonically increasing within a ring
+            if (val < prev) {
+                *saw_other_ring_value = true;
+            }
+            prev = val;
+        }
+    };
+
+    // Write to ring 0 and ring 1 concurrently; read from each
+    bool ring0_corrupted = false;
+    bool ring1_corrupted = false;
+
+    std::thread w0(writer, 0);
+    std::thread w1(writer, 1);
+    std::thread r0(reader, 0, &ring0_corrupted);
+    std::thread r1(reader, 1, &ring1_corrupted);
+
+    w0.join();
+    w1.join();
+    r0.join();
+    r1.join();
+
+    EXPECT_FALSE(ring0_corrupted) << "Ring 0 current_task_index should be monotonic";
+    EXPECT_FALSE(ring1_corrupted) << "Ring 1 current_task_index should be monotonic";
+
+    // Final values should be kIterations for each ring (independently)
+    EXPECT_EQ(handle->header->rings[0].fc.current_task_index.load(), static_cast<int32_t>(kIterations));
+    EXPECT_EQ(handle->header->rings[1].fc.current_task_index.load(), static_cast<int32_t>(kIterations));
+}
+
+// Concurrent current_task_index increment: simulate orchestrator publishing task IDs
+TEST_F(SharedMemoryConcurrentTest, TaskIndexAtomicIncrement) {
+    constexpr int kIncrements = 5000;
+    constexpr int kThreads = 4;
+
+    auto &fc = handle->header->rings[0].fc;
+    fc.current_task_index.store(0, std::memory_order_relaxed);
+
+    auto incrementer = [&]() {
+        for (int i = 0; i < kIncrements; i++) {
+            fc.current_task_index.fetch_add(1, std::memory_order_acq_rel);
+        }
+    };
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i < kThreads; i++) {
+        threads.emplace_back(incrementer);
+    }
+    for (auto &t : threads)
+        t.join();
+
+    EXPECT_EQ(fc.current_task_index.load(), kIncrements * kThreads) << "Concurrent increments should not lose updates";
+}
+
+// Concurrent orchestrator_done and error code write: first-writer-wins semantics
+TEST_F(SharedMemoryConcurrentTest, OrchestratorDoneRace) {
+    constexpr int kRounds = 500;
+
+    for (int round = 0; round < kRounds; round++) {
+        handle->header->orchestrator_done.store(0, std::memory_order_relaxed);
+        handle->header->orch_error_code.store(0, std::memory_order_relaxed);
+
+        std::atomic<int> winners{0};
+
+        auto try_set_done = [&](int32_t error_code) {
+            int32_t expected = 0;
+            if (handle->header->orchestrator_done.compare_exchange_strong(
+                    expected, 1, std::memory_order_acq_rel, std::memory_order_acquire
+                )) {
+                handle->header->orch_error_code.store(error_code, std::memory_order_release);
+                winners.fetch_add(1, std::memory_order_relaxed);
+            }
+        };
+
+        std::thread t1(try_set_done, 100);
+        std::thread t2(try_set_done, 200);
+        std::thread t3(try_set_done, 300);
+        t1.join();
+        t2.join();
+        t3.join();
+
+        EXPECT_EQ(winners.load(), 1) << "Round " << round << ": exactly one thread should win the CAS";
+        EXPECT_EQ(handle->header->orchestrator_done.load(), 1);
+        int32_t code = handle->header->orch_error_code.load();
+        EXPECT_TRUE(code == 100 || code == 200 || code == 300)
+            << "Error code should be from one of the competing threads";
+    }
+}
+
+// Concurrent last_task_alive advancement: only forward movement
+TEST_F(SharedMemoryConcurrentTest, LastTaskAliveMonotonic) {
+    constexpr int kIterations = 10000;
+    constexpr int kThreads = 4;
+
+    auto &fc = handle->header->rings[0].fc;
+    fc.last_task_alive.store(0, std::memory_order_relaxed);
+
+    auto advancer = [&](int id) {
+        for (int i = 0; i < kIterations; i++) {
+            // CAS-based forward-only update
+            int32_t desired = id * kIterations + i + 1;
+            int32_t current = fc.last_task_alive.load(std::memory_order_acquire);
+            while (current < desired) {
+                if (fc.last_task_alive.compare_exchange_weak(
+                        current, desired, std::memory_order_acq_rel, std::memory_order_acquire
+                    )) {
+                    break;
+                }
+            }
+        }
+    };
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i < kThreads; i++) {
+        threads.emplace_back(advancer, i);
+    }
+    for (auto &t : threads)
+        t.join();
+
+    int32_t final_val = fc.last_task_alive.load();
+    // Should be at least the max of any thread's last write
+    EXPECT_GE(final_val, kIterations) << "last_task_alive should have advanced";
+}
+
+// Validate after concurrent modifications still reports corruption correctly
+TEST_F(SharedMemoryConcurrentTest, ValidateAfterConcurrentWrites) {
+    constexpr int kIterations = 1000;
+
+    // Concurrent writers update current_task_index within valid range
+    auto writer = [&](int ring) {
+        auto &fc = handle->header->rings[ring].fc;
+        for (int i = 0; i < kIterations; i++) {
+            fc.current_task_index.store(static_cast<int32_t>(i % 256), std::memory_order_release);
+        }
+    };
+
+    std::thread w0(writer, 0);
+    std::thread w1(writer, 1);
+    std::thread w2(writer, 2);
+    std::thread w3(writer, 3);
+    w0.join();
+    w1.join();
+    w2.join();
+    w3.join();
+
+    EXPECT_TRUE(pto2_sm_validate(handle)) << "Valid current_task_index values should pass validation";
+
+    // Corrupt one ring and verify detection
+    handle->header->rings[2].fc.current_task_index.store(-1, std::memory_order_relaxed);
+    EXPECT_FALSE(pto2_sm_validate(handle)) << "Corrupted 
current_task_index should fail validation"; +} + +// Double destroy: pto2_sm_destroy(NULL) is safe +TEST_F(SharedMemoryConcurrentTest, DestroyNullIsSafe) { + pto2_sm_destroy(nullptr); // Should not crash +} diff --git a/tests/ut/cpp/pto2_a2a3/test_core_types.cpp b/tests/ut/cpp/pto2_a2a3/test_core_types.cpp new file mode 100644 index 000000000..b78a4d0b1 --- /dev/null +++ b/tests/ut/cpp/pto2_a2a3/test_core_types.cpp @@ -0,0 +1,230 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for core types in pto_runtime2_types.h + * + * Tests PTO2TaskId encoding, alignment assertions, utility macros, and + * task-state machine transitions / subtask-completion bitmask semantics. 
+ */ + +#include <gtest/gtest.h> + +#include <cstdint> + +#include "pto_runtime2_types.h" + +// ============================================================================= +// PTO2TaskId encoding/extraction +// ============================================================================= + +TEST(TaskId, DefaultIsZero) { + PTO2TaskId id{}; + EXPECT_EQ(id.raw, 0u); + EXPECT_EQ(id.ring(), 0); + EXPECT_EQ(id.local(), 0u); +} + +TEST(TaskId, MakeAndExtract) { + auto id = PTO2TaskId::make(2, 42); + EXPECT_EQ(id.ring(), 2); + EXPECT_EQ(id.local(), 42u); +} + +TEST(TaskId, RingInUpperBits) { + auto id = PTO2TaskId::make(3, 0); + EXPECT_EQ(id.raw, static_cast<uint64_t>(3) << 32); + EXPECT_EQ(id.ring(), 3); + EXPECT_EQ(id.local(), 0u); +} + +TEST(TaskId, MaxRingMaxLocal) { + auto id = PTO2TaskId::make(255, 0xFFFFFFFF); + EXPECT_EQ(id.ring(), 255); + EXPECT_EQ(id.local(), 0xFFFFFFFF); +} + +TEST(TaskId, Roundtrip) { + for (uint8_t ring = 0; ring < PTO2_MAX_RING_DEPTH; ring++) { + for (uint32_t local : {0u, 1u, 100u, 0xFFFFu, 0xFFFFFFFFu}) { + auto id = PTO2TaskId::make(ring, local); + EXPECT_EQ(id.ring(), ring); + EXPECT_EQ(id.local(), local); + } + } +} + +TEST(TaskId, Equality) { + auto a = PTO2TaskId::make(1, 42); + auto b = PTO2TaskId::make(1, 42); + auto c = PTO2TaskId::make(1, 43); + auto d = PTO2TaskId::make(2, 42); + + EXPECT_TRUE(a == b); + EXPECT_FALSE(a != b); + EXPECT_TRUE(a != c); + EXPECT_TRUE(a != d); +} + +TEST(TaskId, SizeIs8Bytes) { EXPECT_EQ(sizeof(PTO2TaskId), 8u); } + +// ============================================================================= +// PTO2TaskSlotState size (cache-line aligned) +// ============================================================================= + +// ABI contract: size must match shared memory layout (cache-line aligned). 
+TEST(TaskSlotState, SizeIs64Bytes) { EXPECT_EQ(sizeof(PTO2TaskSlotState), 64u); } + +// ============================================================================= +// PTO2_ALIGN_UP macro +// ============================================================================= + +TEST(AlignUp, Zero) { EXPECT_EQ(PTO2_ALIGN_UP(0, 64), 0u); } + +TEST(AlignUp, AlreadyAligned) { + EXPECT_EQ(PTO2_ALIGN_UP(64, 64), 64u); + EXPECT_EQ(PTO2_ALIGN_UP(128, 64), 128u); +} + +TEST(AlignUp, NotAligned) { + EXPECT_EQ(PTO2_ALIGN_UP(1, 64), 64u); + EXPECT_EQ(PTO2_ALIGN_UP(63, 64), 64u); + EXPECT_EQ(PTO2_ALIGN_UP(65, 64), 128u); +} + +TEST(AlignUp, SmallAlignment) { + EXPECT_EQ(PTO2_ALIGN_UP(5, 4), 8u); + EXPECT_EQ(PTO2_ALIGN_UP(4, 4), 4u); + EXPECT_EQ(PTO2_ALIGN_UP(3, 4), 4u); +} + +// ============================================================================= +// Task state enum values +// ============================================================================= + +// ABI contract: values must match wire protocol / shared memory layout. +TEST(TaskState, EnumValues) { + EXPECT_EQ(PTO2_TASK_PENDING, 0); + EXPECT_EQ(PTO2_TASK_READY, 1); + EXPECT_EQ(PTO2_TASK_RUNNING, 2); + EXPECT_EQ(PTO2_TASK_COMPLETED, 3); + EXPECT_EQ(PTO2_TASK_CONSUMED, 4); +} + +// ============================================================================= +// Error code constants +// ============================================================================= + +// ABI contract: values must match wire protocol / shared memory layout. 
+TEST(ErrorCodes, Values) { + EXPECT_EQ(PTO2_ERROR_NONE, 0); + EXPECT_EQ(PTO2_ERROR_SCOPE_DEADLOCK, 1); + EXPECT_EQ(PTO2_ERROR_HEAP_RING_DEADLOCK, 2); + EXPECT_EQ(PTO2_ERROR_FLOW_CONTROL_DEADLOCK, 3); + EXPECT_EQ(PTO2_ERROR_DEP_POOL_OVERFLOW, 4); + EXPECT_EQ(PTO2_ERROR_INVALID_ARGS, 5); + EXPECT_EQ(PTO2_ERROR_SCHEDULER_TIMEOUT, 100); +} + +// ============================================================================= +// Configuration constants +// ============================================================================= + +TEST(Config, TaskWindowSizeIsPowerOf2) { + EXPECT_GT(PTO2_TASK_WINDOW_SIZE, 0); + EXPECT_EQ(PTO2_TASK_WINDOW_SIZE & (PTO2_TASK_WINDOW_SIZE - 1), 0); +} + +TEST(Config, MaxRingDepth) { EXPECT_EQ(PTO2_MAX_RING_DEPTH, 4); } + +TEST(Config, AlignSize) { EXPECT_EQ(PTO2_ALIGN_SIZE, 64); } + +// ============================================================================= +// Task state machine: valid transitions PENDING -> READY -> RUNNING -> +// COMPLETED -> CONSUMED +// ============================================================================= + +TEST(TaskStateTest, ValidTransitions) { + PTO2TaskSlotState slot{}; + slot.task_state.store(PTO2_TASK_PENDING); + + PTO2TaskState expected = PTO2_TASK_PENDING; + bool ok = slot.task_state.compare_exchange_strong(expected, PTO2_TASK_READY); + EXPECT_TRUE(ok); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_READY); + + expected = PTO2_TASK_READY; + ok = slot.task_state.compare_exchange_strong(expected, PTO2_TASK_RUNNING); + EXPECT_TRUE(ok); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_RUNNING); + + expected = PTO2_TASK_RUNNING; + ok = slot.task_state.compare_exchange_strong(expected, PTO2_TASK_COMPLETED); + EXPECT_TRUE(ok); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_COMPLETED); + + expected = PTO2_TASK_COMPLETED; + ok = slot.task_state.compare_exchange_strong(expected, PTO2_TASK_CONSUMED); + EXPECT_TRUE(ok); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED); +} + +// Invalid transition: 
PENDING -> RUNNING (must go through READY) +TEST(TaskStateTest, InvalidTransition_PendingToRunning) { + PTO2TaskSlotState slot{}; + slot.task_state.store(PTO2_TASK_PENDING); + + PTO2TaskState expected = PTO2_TASK_READY; + bool ok = slot.task_state.compare_exchange_strong(expected, PTO2_TASK_RUNNING); + EXPECT_FALSE(ok); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_PENDING); +} + +// Subtask completion bitmask +TEST(TaskStateTest, SubtaskCompletion) { + PTO2TaskSlotState slot{}; + slot.active_mask = PTO2_SUBTASK_MASK_AIC | PTO2_SUBTASK_MASK_AIV0 | PTO2_SUBTASK_MASK_AIV1; + slot.subtask_done_mask.store(0); + + uint8_t prev = slot.subtask_done_mask.fetch_or(PTO2_SUBTASK_MASK_AIC); + EXPECT_EQ(prev, 0u); + EXPECT_NE(slot.subtask_done_mask.load() & slot.active_mask, slot.active_mask); + + slot.subtask_done_mask.fetch_or(PTO2_SUBTASK_MASK_AIV0); + EXPECT_NE(slot.subtask_done_mask.load() & slot.active_mask, slot.active_mask); + + slot.subtask_done_mask.fetch_or(PTO2_SUBTASK_MASK_AIV1); + EXPECT_EQ(slot.subtask_done_mask.load() & slot.active_mask, slot.active_mask); +} + +// Fanin/fanout refcount correctness +TEST(TaskStateTest, FaninRefcount) { + PTO2TaskSlotState slot{}; + slot.fanin_count = 3; + slot.fanin_refcount.store(0); + + for (int i = 0; i < 3; i++) { + slot.fanin_refcount.fetch_add(1); + } + + EXPECT_EQ(slot.fanin_refcount.load(), slot.fanin_count); +} + +TEST(TaskStateTest, FanoutRefcount) { + PTO2TaskSlotState slot{}; + slot.fanout_count = 5; + slot.fanout_refcount.store(0); + + for (int i = 0; i < 5; i++) { + slot.fanout_refcount.fetch_add(1); + } + + EXPECT_EQ(slot.fanout_refcount.load(), slot.fanout_count); +} diff --git a/tests/ut/cpp/pto2_a2a3/test_coupling.cpp b/tests/ut/cpp/pto2_a2a3/test_coupling.cpp new file mode 100644 index 000000000..40893eda0 --- /dev/null +++ b/tests/ut/cpp/pto2_a2a3/test_coupling.cpp @@ -0,0 +1,780 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Architectural coupling detection tests for TMR (tensormap_and_ringbuffer) runtime. + * + * These tests verify whether components can operate in isolation or require + * the full system to be initialized. Failures indicate tight coupling that + * makes unit testing and independent evolution difficult. + * + * Test philosophy: FAIL = coupling defect detected (expected for some tests). 
+ */ + +#include <atomic> +#include <cstdlib> +#include <cstring> +#include <gtest/gtest.h> + +#include "pto_orchestrator.h" +#include "pto_scheduler.h" +#include "pto_tensormap.h" +#include "pto_ring_buffer.h" +#include "pto_shared_memory.h" +#include "pto_runtime2_types.h" +#include "pto_orchestration_api.h" // for make_tensor_external (Tensor ctor is private) +#include "tensor.h" + +// ============================================================================= +// Helper: Full TMR system init/destroy (measures what's needed) +// ============================================================================= + +static constexpr uint64_t TEST_HEAP_SIZE = 65536; +static constexpr int32_t TEST_WINDOW_SIZE = 64; + +struct TMRSystem { + PTO2SharedMemoryHandle *sm = nullptr; + PTO2SchedulerState sched{}; + PTO2OrchestratorState orch{}; + uint8_t *gm_heap = nullptr; + bool sm_ok = false, sched_ok = false, orch_ok = false; + + bool init(uint64_t heap_size = TEST_HEAP_SIZE, int32_t window_size = TEST_WINDOW_SIZE) { + sm = pto2_sm_create(window_size, heap_size); + if (!sm) return false; + sm_ok = true; + + gm_heap = (uint8_t *)calloc(PTO2_MAX_RING_DEPTH, heap_size); + if (!gm_heap) return false; + + if (!pto2_scheduler_init(&sched, sm->header)) return false; + sched_ok = true; + + if (!pto2_orchestrator_init(&orch, sm->header, gm_heap, heap_size, 256)) return false; + orch_ok = true; + + pto2_orchestrator_set_scheduler(&orch, &sched); + return true; + } + + void destroy() { + if (orch_ok) pto2_orchestrator_destroy(&orch); + if (sched_ok) pto2_scheduler_destroy(&sched); + if (gm_heap) { + free(gm_heap); + gm_heap = nullptr; + } + if (sm_ok) pto2_sm_destroy(sm); + } +}; + +// Helper: create a minimal Tensor for TensorMap operations. +// Tensor's default constructor is private; route through make_tensor_external. +// The `addr` argument is reinterpreted as a fake pointer -- the TensorMap only +// hashes the address and compares shapes, it never dereferences the buffer. 
+static Tensor make_test_tensor(uint64_t addr, uint32_t ndims = 1, uint32_t shape0 = 100) { + uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS] = {}; + shapes[0] = shape0; + for (uint32_t i = 1; i < ndims; i++) + shapes[i] = 1; + return make_tensor_external( + reinterpret_cast<void *>(addr), shapes, ndims, DataType::FLOAT32, /*manual_dep=*/false, /*version=*/0 + ); +} + +// ============================================================================= +// Suite 1: ComponentIsolation +// ============================================================================= + +TEST(ComponentIsolation, TensorMapWithoutOrchPointer) { + // TensorMap has an `orch` pointer field (set by orchestrator_init). + // Can we use TensorMap for insert + lookup without setting it? + int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {16, 16, 16, 16}; + PTO2TensorMap tmap{}; + ASSERT_TRUE(tmap.init(256, 1024, window_sizes)); + + // orch pointer is never set -- TensorMap is used standalone + + // Insert should work + Tensor t = make_test_tensor(0x1000); + PTO2TaskId tid = PTO2TaskId::make(0, 0); + tmap.insert(t, tid); + + // Lookup should work + PTO2LookupResult result; + tmap.lookup(t, result); + EXPECT_GE( + result.count, 1 + ) << "TensorMap lookup works without orch pointer -- orch is unused for core insert/lookup operations"; + + tmap.destroy(); +} + +TEST(ComponentIsolation, TensorMapWithZeroWindowSizes) { + // Passing zero window sizes to TensorMap::init() should be rejected, + // but there's no validation. + int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {0, 0, 0, 0}; + PTO2TensorMap tmap{}; + // init calls malloc(0 * sizeof(ptr)) for task_entry_heads -- implementation-defined + bool ok = tmap.init(256, 1024, window_sizes); + + if (ok) { + // If init succeeded, inserting should be unsafe because + // mask = (0 - 1) = 0xFFFFFFFF -- slot index would be OOB. + // This proves lack of input validation. + // We can't safely test insert, just document the gap. 
+ SUCCEED() << "Zero window_size accepted without validation: " + "insert would compute OOB slot index"; + tmap.destroy(); + } else { + // malloc(0) returned NULL on this platform + SUCCEED() << "init correctly failed with zero window_size (malloc(0) returned NULL)"; + } +} + +TEST(ComponentIsolation, DepPoolReclaimNeedsScheduler) { + // DepListPool::reclaim() takes a PTO2SharedMemoryRingHeader& and walks its + // slot states up to (sm_last_task_alive - 1). + // This couples DepPool to shared-memory ring internals. + PTO2DepListEntry entries[64]; + memset(entries, 0, sizeof(entries)); + std::atomic<int32_t> error_code{0}; + PTO2DepListPool pool; + pool.init(entries, 64, &error_code); + + // Allocate some entries to make top > 0 + for (int i = 0; i < 10; i++) { + pool.alloc(); + } + + // To call reclaim, we need a PTO2SharedMemoryRingHeader. + // Create a minimal SM to get a valid ring header. + PTO2SharedMemoryHandle *sm = pto2_sm_create(TEST_WINDOW_SIZE, TEST_HEAP_SIZE); + ASSERT_NE(sm, nullptr); + + // reclaim with sm_last_task_alive=0 should be a no-op (guard: sm_last_task_alive > 0) + pool.reclaim(sm->header->rings[0], 0); + SUCCEED() << "reclaim with last_task_alive=0 is a no-op"; + + // reclaim with sm_last_task_alive=PTO2_DEP_POOL_CLEANUP_INTERVAL would walk + // ring slot states that nothing in this test ever populated. + // This demonstrates the coupling: DepPool cannot reclaim without a fully + // initialized ring -- we can't safely exercise that path here. + + // Document the coupling via signature inspection + SUCCEED() << "DepPool::reclaim() requires PTO2SharedMemoryRingHeader& -- " + "cannot reclaim without valid shared memory ring header"; + + pto2_sm_destroy(sm); +} + +TEST(ComponentIsolation, DepPoolEnsureSpaceSignatureCoupling) { + // ensure_space() requires a PTO2SharedMemoryRingHeader& in addition to the pool state. + // This couples DepPool to SharedMemory even when no reclamation is needed. + PTO2DepListEntry entries[256]; + memset(entries, 0, sizeof(entries)); + std::atomic<int32_t> error_code{0}; + PTO2DepListPool pool; + pool.init(entries, 256, &error_code); + + // With enough space, ensure_space returns immediately without accessing ring header + PTO2SharedMemoryHandle *sm = pto2_sm_create(TEST_WINDOW_SIZE, TEST_HEAP_SIZE); + ASSERT_NE(sm, nullptr); + + pool.ensure_space(sm->header->rings[0], 5); // available() = 255 >= 5 -- no-op + EXPECT_GE(pool.available(), 5) << "ensure_space returns immediately when space sufficient, " + "but signature still requires PTO2SharedMemoryRingHeader reference"; + + pto2_sm_destroy(sm); +} + +TEST(ComponentIsolation, SchedulerConsumedPathAccessesSM) { + // check_and_handle_consumed -> advance_ring_pointers requires valid SM header. + // Build a minimal slot that would trigger the consumed path. 
+ TMRSystem sys; + ASSERT_TRUE(sys.init()); + + PTO2TaskSlotState &slot = sys.sm->header->rings[0].get_slot_state_by_slot(0); + + // Set up a task that appears consumed + slot.fanout_count = 1; + slot.fanout_refcount.store(1, std::memory_order_relaxed); + slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed); + slot.ring_id = 0; + + // Provide a valid task descriptor so advance_ring_pointers won't crash + PTO2TaskDescriptor dummy_desc{}; + dummy_desc.packed_buffer_base = nullptr; + dummy_desc.packed_buffer_end = nullptr; + slot.task = &dummy_desc; + + // Set current_task_index to 1 so advance_ring_pointers scans slot 0 + sys.sm->header->rings[0].fc.current_task_index.store(1, std::memory_order_relaxed); + + // This should work with valid SM, proving SM is required + sys.sched.check_and_handle_consumed(slot); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED) + << "check_and_handle_consumed works only with valid SM handle -- " + "Scheduler->SharedMemory tight coupling confirmed"; + + sys.destroy(); +} + +TEST(ComponentIsolation, OrchestratorInitWithoutSM) { + // pto2_orchestrator_init dereferences sm_header->rings[r].fc immediately. + // Passing nullptr should crash (no null-check). + PTO2OrchestratorState orch{}; + uint8_t heap[1024]; + + EXPECT_DEATH(pto2_orchestrator_init(&orch, nullptr, heap, 1024, 256), ".*") + << "Orchestrator init does not validate sm_header != nullptr"; +} + +TEST(ComponentIsolation, TaskSlotStateStandalone) { + // TaskSlotState should be the one type that can be operated independently. + // Manually drive the full state machine. 
+ alignas(64) PTO2TaskSlotState slot{}; + slot.fanin_count = 2; + slot.fanout_count = 1; + slot.fanin_refcount.store(0, std::memory_order_relaxed); + slot.fanout_refcount.store(0, std::memory_order_relaxed); + slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); + + // PENDING -> READY: fanin_refcount reaches fanin_count + slot.fanin_refcount.fetch_add(1, std::memory_order_relaxed); + slot.fanin_refcount.fetch_add(1, std::memory_order_relaxed); + EXPECT_EQ(slot.fanin_refcount.load(), slot.fanin_count); + + PTO2TaskState expected_pending = PTO2_TASK_PENDING; + EXPECT_TRUE(slot.task_state.compare_exchange_strong(expected_pending, PTO2_TASK_READY)); + + // READY -> RUNNING + PTO2TaskState expected_ready = PTO2_TASK_READY; + EXPECT_TRUE(slot.task_state.compare_exchange_strong(expected_ready, PTO2_TASK_RUNNING)); + + // RUNNING -> COMPLETED + slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed); + + // COMPLETED -> CONSUMED: fanout_refcount reaches fanout_count + slot.fanout_refcount.fetch_add(1, std::memory_order_relaxed); + EXPECT_EQ(slot.fanout_refcount.load(), slot.fanout_count); + + PTO2TaskState expected_completed = PTO2_TASK_COMPLETED; + EXPECT_TRUE(slot.task_state.compare_exchange_strong(expected_completed, PTO2_TASK_CONSUMED)) + << "TaskSlotState can be fully driven standalone -- good isolation"; +} + +TEST(ComponentIsolation, HeapRingWithLocalAtomics) { + // The standalone PTO2HeapRing/pto2_heap_ring_init API has been consolidated + // into PTO2TaskAllocator, which couples the heap and the task ring. There is + // no longer a way to exercise heap allocation in isolation with just local + // atomics -- you need a fully initialized allocator backed by SM pointers. + // + // This test is preserved as a documentation of the tightening of that + // coupling: heap alloc can no longer run independently of the task ring. 
+ SUCCEED() << "PTO2HeapRing/pto2_heap_ring_init removed -- heap allocation is " + "now embedded in PTO2TaskAllocator, which requires a task ring " + "and SM-backed atomics. Heap allocation is no longer isolable."; +} + +// ============================================================================= +// Suite 2: InitializationOrder +// ============================================================================= + +TEST(InitializationOrder, TensorMapInitWithGarbageWindowSizes) { + // If SM header is not initialized before TensorMap::init_default(), + // garbage window_sizes are read. Simulate this with negative values. + int32_t garbage_sizes[PTO2_MAX_RING_DEPTH] = {-1, -1, -1, -1}; + PTO2TensorMap tmap{}; + + // malloc(-1 * sizeof(ptr)) = malloc(huge) -- should fail + bool ok = tmap.init(256, 1024, garbage_sizes); + EXPECT_FALSE(ok) << "TensorMap::init with negative window_sizes should fail on malloc, " + "but no explicit validation rejects negative values before malloc"; + + if (ok) tmap.destroy(); +} + +TEST(InitializationOrder, SchedulerInitWithZeroWindowSize) { + // If SM has task_window_size=0, scheduler creates arrays of size 0. + PTO2SharedMemoryHandle *sm = pto2_sm_create(0, TEST_HEAP_SIZE); + + if (sm == nullptr) { + // pto2_sm_create rejects 0 window -- good validation + SUCCEED() << "pto2_sm_create rejects window_size=0"; + return; + } + + PTO2SchedulerState sched{}; + + bool ok = pto2_scheduler_init(&sched, sm->header); + if (ok) { + // task_window_mask = 0 - 1 = -1 (wraps to max uint) + // get_slot_state_by_task_id(0) would access slot_states[0 & (-1)] = slot_states[0] + // But slot_states was allocated with new PTO2TaskSlotState[0] -- zero-length! 
+ EXPECT_EQ(sm->header->rings[0].task_window_size, 0u) + << "Zero window_size accepted: slot_states[0] is zero-length allocation, " + "any access is UB"; + pto2_scheduler_destroy(&sched); + } + + pto2_sm_destroy(sm); +} + +TEST(InitializationOrder, OrchestratorDoubleInit) { + // Calling init twice without destroy leaks all first-init allocations. + TMRSystem sys; + ASSERT_TRUE(sys.init()); + + // Re-init without destroy -- old allocations are leaked + uint8_t extra_heap[TEST_HEAP_SIZE * PTO2_MAX_RING_DEPTH]{}; + bool ok = pto2_orchestrator_init(&sys.orch, sys.sm->header, extra_heap, TEST_HEAP_SIZE, 256); + EXPECT_TRUE(ok) << "Double init succeeds -- no guard against re-initialization. " + "First init's allocations are leaked"; + + // Clean up the second init + pto2_orchestrator_destroy(&sys.orch); + + // First init's memory is leaked -- we can't free it anymore + // This is a documentation test: no re-init guard exists + sys.orch_ok = false; // prevent double destroy + sys.destroy(); +} + +TEST(InitializationOrder, OrchestratorBeforeScheduler) { + // Init orchestrator without setting scheduler. scope_begin + scope_end should + // degrade gracefully (skip dependency tracking). + PTO2SharedMemoryHandle *sm = pto2_sm_create(TEST_WINDOW_SIZE, TEST_HEAP_SIZE); + ASSERT_NE(sm, nullptr); + + uint8_t *heap = (uint8_t *)calloc(PTO2_MAX_RING_DEPTH, TEST_HEAP_SIZE); + ASSERT_NE(heap, nullptr); + + PTO2OrchestratorState orch{}; + ASSERT_TRUE(pto2_orchestrator_init(&orch, sm->header, heap, TEST_HEAP_SIZE, 256)); + + // scheduler is not set -- scope_begin/scope_end should not crash + pto2_scope_begin(&orch); + pto2_scope_end(&orch); + SUCCEED() << "scope_begin + scope_end work without scheduler (no crash). 
" + "Tasks submitted in this scope have no dependency tracking."; + + pto2_orchestrator_destroy(&orch); + free(heap); + pto2_sm_destroy(sm); +} + +// ============================================================================= +// Suite 3: CrossComponentContract +// ============================================================================= + +TEST(CrossComponentContract, WindowSizeMismatch) { + // After the PTO2SharedMemoryRingHeader consolidation (#622), both scheduler + // and orchestrator read window_size from the same SM ring header pointer. + // Verify via the SM header: the single source of truth. + PTO2SharedMemoryHandle *sm = pto2_sm_create(TEST_WINDOW_SIZE, TEST_HEAP_SIZE); + ASSERT_NE(sm, nullptr); + + uint8_t *heap = (uint8_t *)calloc(PTO2_MAX_RING_DEPTH, TEST_HEAP_SIZE); + ASSERT_NE(heap, nullptr); + + // Initialize scheduler and orchestrator + PTO2SchedulerState sched{}; + ASSERT_TRUE(pto2_scheduler_init(&sched, sm->header)); + + PTO2OrchestratorState orch{}; + ASSERT_TRUE(pto2_orchestrator_init(&orch, sm->header, heap, TEST_HEAP_SIZE, 256)); + + // Both read from the same SM header -- verify the header value is correct + EXPECT_EQ(sm->header->rings[0].task_window_size, (uint64_t)TEST_WINDOW_SIZE) + << "SM ring header holds the authoritative window_size"; + + // Mutate SM header -- both components see the new value because they + // share the same ring header pointer + sm->header->rings[0].task_window_size = TEST_WINDOW_SIZE * 2; + EXPECT_EQ(sm->header->rings[0].task_window_size, (uint64_t)(TEST_WINDOW_SIZE * 2)) + << "After RingHeader consolidation, mutation is visible to all components " + "through the shared ring header pointer -- independent-caching mismatch eliminated"; + + pto2_orchestrator_destroy(&orch); + pto2_scheduler_destroy(&sched); + free(heap); + pto2_sm_destroy(sm); +} + +TEST(CrossComponentContract, FanoutCountManipulation) { + // fanout_count is set by orchestrator (+1 for scope), checked by scheduler. 
+ // If we bypass the +1 initialization, check_and_handle_consumed fires immediately. + TMRSystem sys; + ASSERT_TRUE(sys.init()); + + PTO2TaskSlotState &slot = sys.sm->header->rings[0].get_slot_state_by_slot(0); + + PTO2TaskDescriptor dummy_desc{}; + dummy_desc.packed_buffer_base = nullptr; + dummy_desc.packed_buffer_end = nullptr; + slot.task = &dummy_desc; + slot.ring_id = 0; + + // Normal init: orchestrator sets fanout_count = 1 (scope ref) + // Here we bypass: set fanout_count = 0 directly + slot.fanout_count = 0; + slot.fanout_refcount.store(0, std::memory_order_relaxed); + slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed); + + sys.sm->header->rings[0].fc.current_task_index.store(1, std::memory_order_relaxed); + + // check_and_handle_consumed: fanout_refcount(0) == fanout_count(0) -> true -> CONSUMED + sys.sched.check_and_handle_consumed(slot); + + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED) + << "fanout_count=0 causes premature CONSUMED transition -- " + "scheduler trusts orchestrator's fanout_count without validation"; + + sys.destroy(); +} + +TEST(CrossComponentContract, HeapTailBeyondTop) { + // Previously tested PTO2HeapRing::pto2_heap_ring_try_alloc with manually + // constructed top/tail atomics. PTO2HeapRing no longer exists as a + // free-standing component -- heap state (top/tail) is now encapsulated in + // PTO2TaskAllocator as local integers derived from task descriptors, not + // from externally writable atomics. An invalid tail>top state cannot be + // synthesized without a full allocator + scheduler setup, so this + // coupling-contract scenario is no longer reachable from a unit test. + SUCCEED() << "PTO2HeapRing removed; heap tail/top are now internal to " + "PTO2TaskAllocator and derived from consumed task descriptors. 
" + "No external atomic to corrupt -- this specific invariant is " + "enforced by construction rather than by validation."; +} + +TEST(CrossComponentContract, ActiveMaskZero) { + // active_mask=0 should never happen (orchestrator has always_assert). + // But scheduler's release_fanin_and_check_ready has no such guard. + alignas(64) PTO2TaskSlotState slot{}; + slot.active_mask = 0; // Invalid -- no subtask active + slot.fanin_count = 1; + slot.fanin_refcount.store(0, std::memory_order_relaxed); + slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); + + PTO2ResourceShape shape = pto2_active_mask_to_shape(0); + // With mask=0: core_mask=0, popcount=0, no AIC bit -> falls through to AIV. + // The enum has been simplified to {AIC, AIV, MIX}; there is no longer a + // distinct AIV_X2 shape (multi-AIV tasks are all MIX). + EXPECT_EQ(static_cast<int>(shape), static_cast<int>(PTO2ResourceShape::AIV)) + << "active_mask=0 maps to AIV -- incorrect shape routing. " + "Orchestrator guards with always_assert, but scheduler does not validate"; +} + +TEST(CrossComponentContract, TaskDescriptorNullInConsumedSlot) { + // Historically advance_ring_pointers dereferenced slot.task->packed_buffer_end + // to drive heap reclamation from the last consumed task. Heap reclamation + // has since moved into PTO2TaskAllocator::update_heap_tail (reached by the + // orchestrator on allocation), so advance_ring_pointers no longer touches + // slot.task at all -- it only walks task_state. The coupling this test was + // designed to surface has been removed by construction. 
+ TMRSystem sys; + ASSERT_TRUE(sys.init()); + + auto &rs = sys.sched.ring_sched_states[0]; + PTO2TaskSlotState &slot = sys.sm->header->rings[0].get_slot_state_by_slot(0); + + // Mark as CONSUMED but leave task pointer as nullptr + slot.task_state.store(PTO2_TASK_CONSUMED, std::memory_order_relaxed); + slot.task = nullptr; // Not initialized + slot.ring_id = 0; + + sys.sm->header->rings[0].fc.current_task_index.store(1, std::memory_order_relaxed); + + // Should no longer crash: advance_ring_pointers now only reads task_state. + rs.advance_ring_pointers(); + EXPECT_EQ(rs.last_task_alive, 1) << "advance_ring_pointers no longer dereferences slot.task -- " + "scheduler/orchestrator heap-reclamation coupling removed"; + + sys.destroy(); +} + +// ============================================================================= +// Suite 4: StateLeakage +// ============================================================================= + +TEST(StateLeakage, HeapErrorCodeInvisibleToScheduler) { + // Orchestrator sets orch_error_code on fatal error. + // Scheduler's hot path does NOT check this error code. 
+ TMRSystem sys; + ASSERT_TRUE(sys.init()); + + // Simulate orchestrator setting fatal error + sys.sm->header->orch_error_code.store(PTO2_ERROR_HEAP_RING_DEADLOCK, std::memory_order_release); + + // Scheduler operations continue despite error: + // push to ready queue + PTO2TaskSlotState &slot = sys.sm->header->rings[0].get_slot_state_by_slot(0); + slot.active_mask = PTO2_SUBTASK_MASK_AIV0; + PTO2ResourceShape shape = pto2_active_mask_to_shape(slot.active_mask); + + bool pushed = sys.sched.ready_queues[static_cast<int>(shape)].push(&slot); + EXPECT_TRUE(pushed); + + // pop from ready queue + PTO2TaskSlotState *popped = sys.sched.ready_queues[static_cast<int>(shape)].pop(); + EXPECT_EQ(popped, &slot) << "Scheduler continues normal operation after orchestrator fatal error -- " + "orch_error_code is one-directional (orch->host), invisible to scheduler hot path"; + + sys.destroy(); +} + +TEST(StateLeakage, HeadOfLineBlocking) { + // advance_ring_pointers scans linearly: stops at first non-CONSUMED slot. + // One incomplete task blocks reclamation of all subsequent CONSUMED tasks. 
+ TMRSystem sys; + ASSERT_TRUE(sys.init()); + + auto &rs = sys.sched.ring_sched_states[0]; + PTO2TaskDescriptor descs[3]{}; + descs[0].packed_buffer_end = nullptr; + descs[1].packed_buffer_end = nullptr; + descs[2].packed_buffer_end = nullptr; + + // Task 0: CONSUMED + PTO2TaskSlotState &slot0 = sys.sm->header->rings[0].get_slot_state_by_slot(0); + slot0.task_state.store(PTO2_TASK_CONSUMED, std::memory_order_relaxed); + slot0.task = &descs[0]; + + // Task 1: COMPLETED (NOT consumed -- fanout incomplete) + PTO2TaskSlotState &slot1 = sys.sm->header->rings[0].get_slot_state_by_slot(1); + slot1.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed); + slot1.task = &descs[1]; + + // Task 2: CONSUMED + PTO2TaskSlotState &slot2 = sys.sm->header->rings[0].get_slot_state_by_slot(2); + slot2.task_state.store(PTO2_TASK_CONSUMED, std::memory_order_relaxed); + slot2.task = &descs[2]; + + sys.sm->header->rings[0].fc.current_task_index.store(3, std::memory_order_relaxed); + + rs.advance_ring_pointers(); + + // last_task_alive should stop at task 1 (COMPLETED, not CONSUMED) + EXPECT_EQ(rs.last_task_alive, 1) << "Head-of-line blocking: task 1 (COMPLETED) blocks reclamation of " + "task 2 (CONSUMED). Linear scan design couples reclamation rate " + "to the slowest consumer in the ring."; + + sys.destroy(); +} + +TEST(StateLeakage, TensorMapCleanupInterval) { + // TensorMap cleanup is triggered every PTO2_TENSORMAP_CLEANUP_INTERVAL tasks. + // Between cleanups, stale entries accumulate in bucket chains, degrading lookup. 
+ int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {256, 256, 256, 256}; + PTO2TensorMap tmap{}; + ASSERT_TRUE(tmap.init(256, 4096, window_sizes)); + + // Insert entries for tasks 0..99 (all same address = same bucket) + for (int i = 0; i < 100; i++) { + Tensor t = make_test_tensor(0x2000); + PTO2TaskId tid = PTO2TaskId::make(0, i); + tmap.insert(t, tid); + } + + // Advance last_task_alive to 80 -- tasks 0..79 are stale + tmap.sync_validity(0, 80); + + // Lookup must traverse all 100 entries (80 stale + 20 valid) + // because cleanup hasn't been triggered yet (need sync_tensormap, not just sync_validity) + PTO2LookupResult result; + Tensor query = make_test_tensor(0x2000); + tmap.lookup(query, result); + + // Should find entries from tasks 80..99 = 20 valid + EXPECT_EQ(result.count, 16) << "Lookup result capped at PTO2_LOOKUP_MAX_RESULTS=16, but stale entries " + "still slow traversal. Cleanup interval (" + << PTO2_TENSORMAP_CLEANUP_INTERVAL + << " tasks) couples TensorMap performance to scheduler's CONSUMED advancement rate"; + + tmap.destroy(); +} + +TEST(StateLeakage, SubtaskMaskProtocol) { + // active_mask bits (AIC=0x1, AIV0=0x2, AIV1=0x4) are set by orchestrator + // and checked by scheduler's on_subtask_complete. There's no shared enum + // enforcing consistency -- just implicit agreement on bit positions. + + // Orchestrator normalizes aiv1-only to aiv0: + // If only aiv1 set (0x4), it moves to aiv0 (0x2). + // Scheduler uses SubtaskSlot enum (AIC=0, AIV0=1, AIV1=2) for done_bit. 
+
+    // Verify the normalization creates an implicit contract:
+    uint8_t mask_aiv1_only = PTO2_SUBTASK_MASK_AIV1;  // 0x4
+    (void)mask_aiv1_only;  // referenced only by the narrative above
+    // After orchestrator normalization: becomes PTO2_SUBTASK_MASK_AIV0 = 0x2
+    uint8_t normalized = PTO2_SUBTASK_MASK_AIV0;  // aiv1 moved to aiv0
+
+    // Scheduler completion path: on_subtask_complete with AIV0 slot sets bit 1
+    uint8_t done_bit = (1u << static_cast<int>(PTO2SubtaskSlot::AIV0));
+    EXPECT_EQ(done_bit, PTO2_SUBTASK_MASK_AIV0);
+
+    // But if scheduler receives completion for AIV1 slot (the physical source),
+    // it would set bit 2, which doesn't match normalized mask 0x2
+    uint8_t wrong_done_bit = (1u << static_cast<int>(PTO2SubtaskSlot::AIV1));
+    EXPECT_NE(wrong_done_bit, normalized)
+        << "Subtask mask protocol: orchestrator normalizes aiv1->aiv0 (mask 0x4->0x2), "
+           "but scheduler must dispatch to AIV0 slot (not AIV1). "
+           "If scheduler signals AIV1 completion, done_mask (0x4) != active_mask (0x2) -- "
+           "task never completes. No compile-time enforcement exists.";
+}
+
+// =============================================================================
+// Suite 5: CompileTimeCoupling
+// =============================================================================
+
+TEST(CompileTimeCoupling, OrchestratorInitDestroyCycle) {
+    // Orchestrator embeds rings, TensorMap, scope stack -- a large composite.
+    // Verify it can be initialized and destroyed cleanly multiple times,
+    // proving all sub-components are properly managed.
+    for (int cycle = 0; cycle < 3; cycle++) {
+        TMRSystem sys;
+        ASSERT_TRUE(sys.init()) << "Init cycle " << cycle;
+        sys.destroy();
+    }
+    SUCCEED() << "OrchestratorState init/destroy is clean across multiple cycles";
+}
+
+TEST(CompileTimeCoupling, MaxRingDepthPropagation) {
+    // PTO2_MAX_RING_DEPTH=4 is used across multiple components.
+    // Verify that the system initializes and operates correctly for all rings
+    // up to PTO2_MAX_RING_DEPTH, without probing internal array sizes.
+ + // static_asserts on array sizes at the struct level are compile-time safety + // nets that belong in production headers, not in behavioral tests. + // This test verifies the functional consequence: all ring indices work. + PTO2SharedMemoryHandle *sm = pto2_sm_create(TEST_WINDOW_SIZE, TEST_HEAP_SIZE); + ASSERT_NE(sm, nullptr); + + // Verify all rings are accessible through SM header + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + EXPECT_EQ(sm->header->rings[r].task_window_size, (uint64_t)TEST_WINDOW_SIZE) + << "Ring " << r << " should be initialized with correct window_size"; + } + + // TensorMap should accept inserts and lookups on all rings + int32_t window_sizes[PTO2_MAX_RING_DEPTH]; + for (int i = 0; i < PTO2_MAX_RING_DEPTH; i++) + window_sizes[i] = TEST_WINDOW_SIZE; + PTO2TensorMap tmap{}; + ASSERT_TRUE(tmap.init(256, 1024, window_sizes)); + + Tensor t = make_test_tensor(0x1000); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + tmap.insert(t, PTO2TaskId::make(r, 0)); + } + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(t, result); + EXPECT_EQ(result.count, PTO2_MAX_RING_DEPTH) + << "TensorMap supports inserts on all " << PTO2_MAX_RING_DEPTH << " rings"; + + tmap.destroy(); + pto2_sm_destroy(sm); +} + +TEST(CompileTimeCoupling, WindowSizeConsistencyAfterInit) { + // Verify that after full system init, all components operate correctly + // with the configured window_size by exercising the public API. + TMRSystem sys; + ASSERT_TRUE(sys.init()); + + // The authoritative window_size lives in the SM ring header + uint64_t expected_window = sys.sm->header->rings[0].task_window_size; + EXPECT_EQ(expected_window, (uint64_t)TEST_WINDOW_SIZE); + + // Verify functional consistency: insert tasks up to window_size + // and confirm TensorMap, Orchestrator, and Scheduler all work correctly. 
+ Tensor t = make_test_tensor(0x1000); + pto2_scope_begin(&sys.orch); + + // Insert a tensor -- exercises Orchestrator + TensorMap + sys.orch.tensor_map.insert(t, PTO2TaskId::make(0, 0)); + + // Lookup -- exercises TensorMap with its window_size + PTO2LookupResult result; + result.count = 0; + sys.orch.tensor_map.lookup(t, result); + EXPECT_EQ(result.count, 1) << "TensorMap insert+lookup works with configured window_size"; + + pto2_scope_end(&sys.orch); + + sys.destroy(); +} + +TEST(CompileTimeCoupling, TaskSlotStateLifecycleStandalone) { + // Verify TaskSlotState can be fully driven through its state machine + // without any other component -- proving it is the nexus type that + // both orchestrator and scheduler operate on. + alignas(64) PTO2TaskSlotState slot{}; + slot.fanin_count = 2; + slot.fanout_count = 1; + slot.fanin_refcount.store(0, std::memory_order_relaxed); + slot.fanout_refcount.store(0, std::memory_order_relaxed); + slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); + + // Drive full lifecycle: PENDING -> READY -> RUNNING -> COMPLETED -> CONSUMED + slot.fanin_refcount.fetch_add(1); + slot.fanin_refcount.fetch_add(1); + EXPECT_EQ(slot.fanin_refcount.load(), slot.fanin_count); + + PTO2TaskState expected = PTO2_TASK_PENDING; + EXPECT_TRUE(slot.task_state.compare_exchange_strong(expected, PTO2_TASK_READY)); + + expected = PTO2_TASK_READY; + EXPECT_TRUE(slot.task_state.compare_exchange_strong(expected, PTO2_TASK_RUNNING)); + + slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed); + + slot.fanout_refcount.fetch_add(1); + EXPECT_EQ(slot.fanout_refcount.load(), slot.fanout_count); + + expected = PTO2_TASK_COMPLETED; + EXPECT_TRUE(slot.task_state.compare_exchange_strong(expected, PTO2_TASK_CONSUMED)) + << "TaskSlotState can be fully driven standalone -- references types from " + "both orchestrator and scheduler domains but is independently operable"; +} + +TEST(CompileTimeCoupling, ReadyQueueAllShapesUsable) { + // 
PTO2_NUM_RESOURCE_SHAPES ready queues exist (one per shape). + // Verify all can be initialized and used for push/pop. + for (int s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++) { + PTO2ReadyQueue queue{}; + ASSERT_TRUE(pto2_ready_queue_init(&queue, 16)) << "Shape " << s << " queue init failed"; + + PTO2TaskSlotState item{}; + EXPECT_TRUE(queue.push(&item)); + EXPECT_EQ(queue.pop(), &item); + + pto2_ready_queue_destroy(&queue); + } +} + +TEST(CompileTimeCoupling, LinkDependencyChain) { + // This test file links 5 runtime .cpp files: + // pto_orchestrator.cpp, pto_tensormap.cpp, pto_shared_memory.cpp, + // pto_ring_buffer.cpp, pto_scheduler.cpp + // This is because pto_tensormap.cpp includes pto_orchestrator.h (circular), + // which includes pto_scheduler.h, pto_ring_buffer.h, pto_shared_memory.h. + // Cannot compile TensorMap without linking the full runtime. + SUCCEED() << "test_coupling links 5 runtime .cpp files. " + "Root cause: pto_tensormap.cpp #includes pto_orchestrator.h " + "for sync_tensormap, creating a circular compile-unit dependency. " + "This forces all tests that include TensorMap to also link " + "Orchestrator, Scheduler, RingBuffer, and SharedMemory."; +} diff --git a/tests/ut/cpp/pto2_a2a3/test_coupling_stub.cpp b/tests/ut/cpp/pto2_a2a3/test_coupling_stub.cpp new file mode 100644 index 000000000..022f4da2b --- /dev/null +++ b/tests/ut/cpp/pto2_a2a3/test_coupling_stub.cpp @@ -0,0 +1,727 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Stub-based architectural coupling detection tests.
+ *
+ * This file deliberately excludes pto_orchestrator.cpp from the link.
+ * If it compiles and links successfully, that PROVES TensorMap + Scheduler +
+ * RingBuffer + SharedMemory can be used without the Orchestrator at link time.
+ *
+ * Key distinction probed here:
+ *   Link-time coupling    -- .o file has UND symbols pointing to another component
+ *   Compile-time coupling -- .cpp includes another component's header (type access)
+ *   Type-level coupling   -- function signature uses another component's struct type,
+ *                            forcing full include even if only a pointer is stored
+ *
+ * Test philosophy: document coupling depth precisely using stubs.
+ * FAIL = a coupling contract that the src violates or makes harder than necessary.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <new>
+
+#include "pto_ring_buffer.h"
+#include "pto_scheduler.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "pto_runtime2_types.h"
+#include "tensor.h"
+// Only for make_tensor_external (inline, no link dependency on orchestrator.cpp).
+#include "pto_orchestration_api.h"
+
+// =============================================================================
+// Shared helpers
+// =============================================================================
+
+static constexpr uint64_t SH = 65536;  // heap size for sm_create
+static constexpr int32_t SW = 64;      // task window size
+
+// Minimal stub: allocate only the fields reclaim() reads.
+// Fields task_window_size/mask/slot_states now live on PTO2SharedMemoryRingHeader,
+// so we build a fake ring header on the heap.
+struct MinimalSchedStub {
+    PTO2SharedMemoryRingHeader ring_header{};
+    PTO2TaskSlotState *slot_array = nullptr;
+    static constexpr int32_t WINDOW = 64;
+
+    bool init(uint8_t /*ring_id*/ = 0) {
+        memset(&ring_header, 0, sizeof(ring_header));
+        slot_array = new (std::nothrow) PTO2TaskSlotState[WINDOW]{};
+        if (!slot_array) return false;
+        ring_header.slot_states = slot_array;
+        ring_header.task_window_size = WINDOW;
+        ring_header.task_window_mask = WINDOW - 1;
+        return true;
+    }
+
+    void destroy() {
+        delete[] slot_array;
+        slot_array = nullptr;
+    }
+};
+
+// Minimal pool helper: 512-entry DepListPool.
+struct SmallPool {
+    PTO2DepListEntry entries[512];
+    std::atomic<int32_t> error_code{0};
+    PTO2DepListPool pool;
+
+    void init() {
+        memset(entries, 0, sizeof(entries));
+        pool.init(entries, 512, &error_code);
+    }
+    int alloc_n(int n) {
+        int last = 0;
+        for (int i = 0; i < n; i++) {
+            auto *e = pool.alloc();
+            if (e) last = i + 1;
+        }
+        return last;
+    }
+};
+
+static Tensor make_tensor(uint64_t addr, uint32_t ndims = 1, uint32_t shape0 = 100) {
+    // Use make_tensor_external (inline header helper) since Tensor default
+    // constructor is private. The helper does not create any link-time
+    // dependency on pto_orchestrator.cpp.
+    uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS] = {};
+    shapes[0] = shape0;
+    for (uint32_t i = 1; i < ndims; ++i)
+        shapes[i] = 1;
+    return make_tensor_external(
+        reinterpret_cast<void *>(static_cast<uintptr_t>(addr)), shapes, ndims, DataType::FLOAT32,
+        /*manual_dep=*/false,
+        /*version=*/0
+    );
+}
+
+// =============================================================================
+// Suite 1: DepPoolStubIsolation
+// =============================================================================
+
+// sm_last_task_alive < PTO2_DEP_POOL_CLEANUP_INTERVAL: reclaim is a no-op.
+// A zero-initialized PTO2SharedMemoryRingHeader (slot_states=nullptr) must not crash.
+TEST(DepPoolStubIsolation, ReclaimBelowInterval_NeverAccessesScheduler) { + SmallPool sp; + sp.init(); + sp.alloc_n(100); + + // Capture used count BEFORE reclaim to compare after + int32_t used_before = sp.pool.used(); + + // Zero-init stub -- slot_states is nullptr + PTO2SharedMemoryRingHeader ring_hdr{}; + memset(&ring_hdr, 0, sizeof(ring_hdr)); + + // sm_last_task_alive = interval - 1 -> guard `>= interval` is false -> no-op + int32_t below = PTO2_DEP_POOL_CLEANUP_INTERVAL - 1; + sp.pool.reclaim(ring_hdr, below); + + // Pool unchanged -- reclaim was a no-op + EXPECT_EQ(sp.pool.used(), used_before) + << "reclaim() is a no-op when sm_last_task_alive < interval. " + "A fully zero-initialized (nullptr slot_states) PTO2SharedMemoryRingHeader " + "is safe to pass -- the struct is never touched."; +} + +// sm_last_task_alive == PTO2_DEP_POOL_CLEANUP_INTERVAL: reclaim reads exactly +// ring_header.slot_states[(interval-1) & mask].dep_pool_mark +// Stub provides only those three values; all other fields remain zero. +TEST(DepPoolStubIsolation, ReclaimAtInterval_OnlyNeedsSlotArrayAndMask) { + SmallPool sp; + sp.init(); + sp.alloc_n(100); // top = 100, tail = 0 + + MinimalSchedStub stub; + ASSERT_TRUE(stub.init(0)); + + // Set dep_pool_mark in the slot reclaim() will read + int32_t sm_last = PTO2_DEP_POOL_CLEANUP_INTERVAL; // e.g. 64 + int32_t target_slot = (sm_last - 1) & (stub.WINDOW - 1); // (63) & 63 = 63 + stub.slot_array[target_slot].dep_pool_mark = 50; + + sp.pool.reclaim(stub.ring_header, sm_last); + + // reclaim should advance pool tail so used count drops (from 100 to 51) + EXPECT_EQ(sp.pool.used(), 51) << "reclaim() reads EXACTLY THREE values from PTO2SharedMemoryRingHeader:\n" + " 1. slot_states (the pointer)\n" + " 2. task_window_mask\n" + " 3. slot_states[(sm_last-1) & mask].dep_pool_mark\n" + "All other fields of PTO2SharedMemoryRingHeader are unused."; + + stub.destroy(); +} + +// ensure_space() returns immediately when available() >= needed. 
+// PTO2SharedMemoryRingHeader is never accessed in the fast path. +TEST(DepPoolStubIsolation, EnsureSpaceWithSufficientCapacity_NoSchedulerAccess) { + SmallPool sp; + sp.init(); + // Pool is empty: available() = capacity - 1 = 511 >> needed = 5 + + PTO2SharedMemoryRingHeader ring_hdr{}; + memset(&ring_hdr, 0, sizeof(ring_hdr)); // slot_states = nullptr (would crash if accessed) + + // Should return immediately without touching ring_hdr internals + sp.pool.ensure_space(ring_hdr, 5); + + EXPECT_GE( + sp.pool.available(), 5 + ) << "ensure_space() exits immediately when available() >= needed. " + "Zero-initialized ring header (slot_states=nullptr) is safe -- never dereferenced. " + "The signature requires PTO2SharedMemoryRingHeader& " + "but it is not accessed in the fast path."; +} + +// Document the sizeof cost: reclaim now takes PTO2SharedMemoryRingHeader which +// directly contains the three needed fields -- coupling is significantly reduced. +TEST(DepPoolStubIsolation, ReclaimRequiresExactlyThreeFields_NowOnRingHeader) { + // Fields actually needed by reclaim(): + // PTO2SharedMemoryRingHeader::slot_states (8 bytes, pointer) + // PTO2SharedMemoryRingHeader::task_window_mask (4 bytes, int32_t) + // PTO2TaskSlotState::dep_pool_mark (4 bytes, int32_t) + // Total minimum: 16 bytes of live data. + size_t needed_bytes = sizeof(PTO2TaskSlotState *) + sizeof(int32_t) + sizeof(int32_t); + + // Actual cost imposed by PTO2SharedMemoryRingHeader: + size_t actual_bytes = sizeof(PTO2SharedMemoryRingHeader); + + EXPECT_GT(actual_bytes, needed_bytes) << "reclaim() needs ~16 bytes of data but requires passing " + "PTO2SharedMemoryRingHeader (" + << actual_bytes + << " bytes). 
" + "Ratio: " + << (actual_bytes / needed_bytes) << "x over-coupling."; + + // Also report the exact sizes for documentation + SUCCEED() << "sizeof(PTO2SharedMemoryRingHeader) = " << actual_bytes << " bytes\n" + << "sizeof(PTO2TaskSlotState*) + 2*int32_t = " << needed_bytes << " bytes\n" + << "sizeof(PTO2TaskSlotState) = " << sizeof(PTO2TaskSlotState); +} + +// ============================================================================= +// Suite 2: SchedulerWithoutOrchestrator +// ============================================================================= + +// Scheduler can be fully initialized and destroyed without any orchestrator code. +// This test links pto_scheduler.cpp + pto_shared_memory.cpp only. +TEST(SchedulerWithoutOrchestrator, InitAndDestroy_NoOrchestratorNeeded) { + PTO2SharedMemoryHandle *sm = pto2_sm_create(SW, SH); + ASSERT_NE(sm, nullptr); + + uint8_t *heap = (uint8_t *)calloc(PTO2_MAX_RING_DEPTH, SH); + ASSERT_NE(heap, nullptr); + + PTO2SchedulerState sched{}; + bool ok = pto2_scheduler_init(&sched, sm->header); + EXPECT_TRUE(ok) << "pto2_scheduler_init succeeds without orchestrator.cpp in the link. " + "Scheduler is link-time isolated from Orchestrator."; + + EXPECT_EQ(sm->header->rings[0].task_window_size, (uint64_t)SW); + EXPECT_EQ(sm->header->rings[0].task_window_mask, SW - 1); + + pto2_scheduler_destroy(&sched); + free(heap); + pto2_sm_destroy(sm); +} + +// PTO2ReadyQueue is header-only (all methods are inline in pto_scheduler.h). +// It needs zero .cpp linkage -- only pto_runtime2_types.h for slot type. +TEST(SchedulerWithoutOrchestrator, ReadyQueue_StandaloneNoExternalDeps) { + PTO2ReadyQueue q; + pto2_ready_queue_init(&q, 64); + + alignas(64) PTO2TaskSlotState slot{}; + slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); + + EXPECT_TRUE(q.push(&slot)); + PTO2TaskSlotState *out = q.pop(); + EXPECT_EQ(out, &slot) << "PTO2ReadyQueue push/pop are entirely header-inline (zero link deps). 
" + "However, pto2_ready_queue_init / pto2_ready_queue_destroy are free " + "functions defined in pto_scheduler.cpp -- even a standalone ReadyQueue " + "requires linking pto_scheduler.cpp for lifecycle management. " + "Push/pop core logic is self-contained; init/destroy coupling is avoidable."; + + pto2_ready_queue_destroy(&q); +} + +// release_fanin_and_check_ready requires zero TensorMap or Orchestrator linkage. +// With fanin_count=1, one call makes new_refcount == fanin_count -> push to queue. +TEST(SchedulerWithoutOrchestrator, ReleaseFanin_PushesWhenFaninMet) { + PTO2SharedMemoryHandle *sm = pto2_sm_create(SW, SH); + ASSERT_NE(sm, nullptr); + uint8_t *heap = (uint8_t *)calloc(PTO2_MAX_RING_DEPTH, SH); + ASSERT_NE(heap, nullptr); + PTO2SchedulerState sched{}; + ASSERT_TRUE(pto2_scheduler_init(&sched, sm->header)); + + alignas(64) PTO2TaskSlotState slot{}; + slot.fanin_count = 1; + slot.fanin_refcount.store(0, std::memory_order_relaxed); + slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); + slot.active_mask = PTO2_SUBTASK_MASK_AIV0; + + bool became_ready = sched.release_fanin_and_check_ready(slot, nullptr); + EXPECT_TRUE(became_ready) << "fanin_count=1, one release -> task is ready"; + + // Verify the slot is now in the ready queue + PTO2ResourceShape shape = pto2_active_mask_to_shape(slot.active_mask); + PTO2TaskSlotState *popped = sched.ready_queues[static_cast(shape)].pop(); + EXPECT_EQ(popped, &slot) << "Slot found in ready queue -- no Orchestrator involvement"; + + pto2_scheduler_destroy(&sched); + free(heap); + pto2_sm_destroy(sm); +} + +// DESIGN CONTRACT: non-profiling release_fanin_and_check_ready pushes to the +// ready queue WITHOUT issuing an extra CAS(PENDING->READY) on task_state. 
+// The profiling overload (pto_scheduler.h:803-825) performs the CAS purely +// to be counted in atomic_count; correctness in either build comes from +// fanin_refcount.fetch_add -- only the decrementer that observes +// new_refcount == fanin_count pushes the slot, so the ready-queue invariant +// is preserved even while task_state remains PENDING. This test pins the +// non-profiling behavior so future edits can't silently add overhead. +TEST(SchedulerWithoutOrchestrator, NonProfiling_ReleaseFanin_DoesNotCAS_TaskState) { +#if PTO2_SCHED_PROFILING + GTEST_SKIP() << "Test only applies to non-profiling builds (PTO2_SCHED_PROFILING=0)"; +#endif + PTO2SharedMemoryHandle *sm = pto2_sm_create(SW, SH); + ASSERT_NE(sm, nullptr); + uint8_t *heap = (uint8_t *)calloc(PTO2_MAX_RING_DEPTH, SH); + ASSERT_NE(heap, nullptr); + PTO2SchedulerState sched{}; + ASSERT_TRUE(pto2_scheduler_init(&sched, sm->header)); + + alignas(64) PTO2TaskSlotState slot{}; + slot.fanin_count = 1; + slot.fanin_refcount.store(0, std::memory_order_relaxed); + slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); + slot.active_mask = PTO2_SUBTASK_MASK_AIV0; + + sched.release_fanin_and_check_ready(slot, nullptr); + + PTO2TaskState state_after = slot.task_state.load(std::memory_order_acquire); + + // Design contract: non-profiling path does not mutate task_state here. + // Dispatch correctness relies on fanin_refcount's atomic fetch_add, not + // on the task_state value at push time. + EXPECT_EQ(state_after, PTO2_TASK_PENDING) << "Non-profiling release_fanin_and_check_ready must not CAS task_state; " + "the profiling overload's CAS exists only for atomic-op counting."; + + pto2_scheduler_destroy(&sched); + free(heap); + pto2_sm_destroy(sm); +} + +// on_mixed_task_complete transitions COMPLETED->CONSUMED with a minimal stub descriptor. +// No TensorMap or Orchestrator calls are made in this path. 
+TEST(SchedulerWithoutOrchestrator, OnMixedTaskComplete_StubDescriptor) { + PTO2SharedMemoryHandle *sm = pto2_sm_create(SW, SH); + ASSERT_NE(sm, nullptr); + uint8_t *heap = (uint8_t *)calloc(PTO2_MAX_RING_DEPTH, SH); + ASSERT_NE(heap, nullptr); + PTO2SchedulerState sched{}; + ASSERT_TRUE(pto2_scheduler_init(&sched, sm->header)); + + auto &rs = sched.ring_sched_states[0]; + PTO2TaskSlotState &slot = sm->header->rings[0].get_slot_state_by_slot(0); + + PTO2TaskDescriptor dummy_desc{}; + dummy_desc.packed_buffer_base = nullptr; + dummy_desc.packed_buffer_end = nullptr; + slot.task = &dummy_desc; + slot.ring_id = 0; + slot.fanout_count = 1; + slot.fanout_refcount.store(1, std::memory_order_relaxed); + slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed); + sm->header->rings[0].fc.current_task_index.store(1, std::memory_order_relaxed); + + sched.check_and_handle_consumed(slot); + + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED) + << "Scheduler's COMPLETED->CONSUMED path requires only a stub " + "PTO2TaskDescriptor (packed_buffer pointers can be nullptr). " + "No TensorMap or Orchestrator calls are made in this path."; + + pto2_scheduler_destroy(&sched); + free(heap); + pto2_sm_destroy(sm); +} + +// ============================================================================= +// Suite 3: TensorMapLinkDecoupling +// ============================================================================= + +// This entire file excludes pto_orchestrator.cpp from the link. +// If TensorMap init/insert/lookup work here, it proves link-time isolation. 
+TEST(TensorMapLinkDecoupling, BuildsAndRunsWithoutOrchestratorCpp) { + int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {64, 64, 64, 64}; + PTO2TensorMap tmap{}; + ASSERT_TRUE(tmap.init(256, 1024, window_sizes)); + + Tensor t = make_tensor(0x3000); + PTO2TaskId tid = PTO2TaskId::make(0, 0); + tmap.insert(t, tid); + + PTO2LookupResult result; + tmap.lookup(t, result); + EXPECT_GE(result.count, 1) << "TensorMap insert+lookup work without pto_orchestrator.cpp in the link.\n" + "Root cause: pto_tensormap.cpp includes pto_orchestrator.h (line 22) but\n" + "calls ZERO orchestrator functions -- confirmed by objdump UND analysis.\n" + "The include only provides the PTO2OrchestratorState type definition,\n" + "which is stored as PTO2OrchestratorState* (pointer -- forward decl suffices)."; + + tmap.destroy(); +} + +// Explicitly set orch = nullptr, then run insert and lookup. +// If orch were dereferenced in the hot path, this would crash. +TEST(TensorMapLinkDecoupling, OrchPointer_NeverDereferencedInHotPath) { + int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {64, 64, 64, 64}; + PTO2TensorMap tmap{}; + ASSERT_TRUE(tmap.init(256, 1024, window_sizes)); + tmap.orch = nullptr; // explicitly clear + + Tensor t1 = make_tensor(0x4000, 1, 200); + Tensor t2 = make_tensor(0x5000, 1, 100); + PTO2TaskId t1id = PTO2TaskId::make(0, 0); + PTO2TaskId t2id = PTO2TaskId::make(0, 1); + tmap.insert(t1, t1id); + tmap.insert(t2, t2id); + + PTO2LookupResult r; + tmap.lookup(t1, r); + EXPECT_GE(r.count, 1) << "orch=nullptr does not crash insert or lookup. " + "The orch pointer is only used by sync_tensormap (called from orchestrator). " + "In normal usage: orch is set by pto2_orchestrator_init, " + "but insert/lookup never touch it."; + + tmap.destroy(); +} + +// sync_tensormap only advances the cleanup clock -- it doesn't access orch. +// Calling it with orch=nullptr is safe. 
+TEST(TensorMapLinkDecoupling, SyncTensormap_DoesNotAccessOrch) { + int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {64, 64, 64, 64}; + PTO2TensorMap tmap{}; + ASSERT_TRUE(tmap.init(256, 1024, window_sizes)); + tmap.orch = nullptr; + + // Insert entries for tasks 0..63 in ring 0 + for (int i = 0; i < 64; i++) { + Tensor t = make_tensor(0x6000 + i * 64); + tmap.insert(t, PTO2TaskId::make(0, i)); + } + + // Advance validity: tasks 0..31 are now retired + tmap.sync_validity(0, 32); + + // sync_tensormap only calls sync_validity internally -- no orch access + tmap.sync_tensormap(PTO2TaskId::make(0, 0), 32); + + // Valid count should reflect only tasks 32..63 + int valid = tmap.valid_count(); + EXPECT_LE(valid, 64) << "sync_tensormap(ring_id, last_alive) is purely time-advance logic. " + "No dereference of orch pointer. " + "Cleanup path is independent of OrchestratorState."; + + tmap.destroy(); +} + +// Document the transitive include chain caused by one unnecessary #include. +TEST(TensorMapLinkDecoupling, IncludeCost_OnePointerField_FullRuntimeHeaders) { + // pto_tensormap.cpp includes pto_orchestrator.h for PTO2OrchestratorState* orch. + // A forward declaration "struct PTO2OrchestratorState;" would be sufficient + // because orch is a raw pointer and is never dereferenced in tensormap.cpp. + // + // Cost of the full include: + // pto_orchestrator.h includes: + // -> pto_scheduler.h -> pto_ring_buffer.h -> pto_shared_memory.h + // -> pto_runtime2_types.h -> pto_types.h, pto_submit_types.h, pto2_dispatch_payload.h + // + // Every TensorMap compilation unit pulls in the entire runtime header tree + // for a single pointer field. + + // Verify: PTO2TensorMap::orch is a raw pointer (not embedded object) + EXPECT_EQ(sizeof(PTO2OrchestratorState *), sizeof(void *)) + << "PTO2OrchestratorState* is a pointer -- sizeof(void*) bytes. " + "A forward declaration suffices. 
" + "The full include of pto_orchestrator.h transitively pulls in " + "pto_scheduler.h + pto_ring_buffer.h + pto_shared_memory.h + " + "pto_runtime2_types.h (7+ headers) for a single 8-byte pointer field."; + + // Also: this test file compiles and links without pto_orchestrator.cpp -- + // further confirming the include is header-only compile-time coupling. + SUCCEED() << "This test file does not link pto_orchestrator.cpp. " + "Build success = confirmed link-time isolation."; +} + +// ============================================================================= +// Suite 4: CompileTimeIncludeCoupling +// ============================================================================= + +// pto_ring_buffer.cpp's DepPool::reclaim takes PTO2SharedMemoryRingHeader& directly. +// ring_buffer.o has ZERO UND symbols from scheduler -- type-level coupling is resolved. +// The coupling is now to PTO2SharedMemoryRingHeader: accessing struct fields inline. +TEST(CompileTimeIncludeCoupling, RingBufferCoupledToSharedMemoryAtTypeLevel) { + // Demonstrate: DepPool::reclaim is in pto_ring_buffer.cpp (not scheduler) + // and it accesses PTO2SharedMemoryRingHeader internal fields inline. + // This means: changing PTO2SharedMemoryRingHeader layout silently breaks ring_buffer + // without any API change or linker error. + + // Cross-check: the field offset in the stub must match the real struct. + MinimalSchedStub stub; + ASSERT_TRUE(stub.init(0)); + + // Write to dep_pool_mark via stub's slot_array + stub.slot_array[63].dep_pool_mark = 99; + + // Read the same field through PTO2SharedMemoryRingHeader's accessor + int32_t mark = stub.ring_header.get_slot_state_by_task_id(63).dep_pool_mark; + EXPECT_EQ(mark, 99) << "ring_buffer.cpp accesses PTO2SharedMemoryRingHeader::slot_states " + "inline (no virtual dispatch, no function call). 
" + "Changing the layout of PTO2TaskSlotState or PTO2SharedMemoryRingHeader breaks " + "pto_ring_buffer.cpp without touching any function signature or .h file API. " + "This is a hidden structural coupling: invisible to the linker."; + + stub.destroy(); +} + +// Both Scheduler and TensorMap independently compute the same slot index formula. +// Duplication means if one changes, the other silently diverges. +TEST(CompileTimeIncludeCoupling, TaskWindowMask_DuplicatedInTwoComponents) { + // Scheduler formula (pto_scheduler.h:301): + // slot_states[local_id & task_window_mask] + // TensorMap formula (pto_tensormap.h:~364): + // local_id & (task_window_sizes[ring_id] - 1) + // Both assume power-of-2 window_size; neither validates it. + + int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {64, 64, 64, 64}; + PTO2TensorMap tmap{}; + ASSERT_TRUE(tmap.init(256, 1024, window_sizes)); + + PTO2SharedMemoryHandle *sm = pto2_sm_create(64, SH); + ASSERT_NE(sm, nullptr); + uint8_t *heap = (uint8_t *)calloc(PTO2_MAX_RING_DEPTH, SH); + ASSERT_NE(heap, nullptr); + PTO2SchedulerState sched{}; + ASSERT_TRUE(pto2_scheduler_init(&sched, sm->header)); + + // Verify both agree for local_id = 37, ring = 0 + int32_t local_id = 37; + int32_t sched_slot = local_id & sm->header->rings[0].task_window_mask; + int32_t tmap_slot = local_id & (tmap.task_window_sizes[0] - 1); + + EXPECT_EQ(sched_slot, tmap_slot) << "Scheduler slot = local_id & mask = " << sched_slot + << "\n" + "TensorMap slot = local_id & (size-1) = " + << tmap_slot + << "\n" + "Currently agree -- but the formula is written twice, in two components, " + "with no shared utility. A change to one (e.g., non-power-of-2 support) " + "would not automatically update the other."; + + pto2_scheduler_destroy(&sched); + free(heap); + pto2_sm_destroy(sm); + tmap.destroy(); +} + +// PTO2_MAX_RING_DEPTH propagates into fixed-size arrays in 4 components. +// Changing it requires recompiling all 4 components simultaneously. 
+TEST(CompileTimeIncludeCoupling, MaxRingDepthInFourComponents) { + // 1. Orchestrator: rings[PTO2_MAX_RING_DEPTH] (visible via TMRSystem) + // 2. Scheduler: ring_sched_states[PTO2_MAX_RING_DEPTH] + static_assert( + sizeof(PTO2SchedulerState::ring_sched_states) / sizeof(PTO2SchedulerState::RingSchedState) == + PTO2_MAX_RING_DEPTH, + "Scheduler array size must equal PTO2_MAX_RING_DEPTH" + ); + + // 3. SharedMemory: header->rings[PTO2_MAX_RING_DEPTH] + static_assert( + sizeof(PTO2SharedMemoryHeader::rings) / sizeof(PTO2SharedMemoryRingHeader) == PTO2_MAX_RING_DEPTH, + "SharedMemory array size must equal PTO2_MAX_RING_DEPTH" + ); + + // 4. TensorMap: task_entry_heads[], task_window_sizes[], last_task_alives[] + PTO2TensorMap dummy{}; + EXPECT_EQ(sizeof(dummy.task_entry_heads) / sizeof(dummy.task_entry_heads[0]), (size_t)PTO2_MAX_RING_DEPTH); + EXPECT_EQ(sizeof(dummy.task_window_sizes) / sizeof(dummy.task_window_sizes[0]), (size_t)PTO2_MAX_RING_DEPTH); + EXPECT_EQ(sizeof(dummy.last_task_alives) / sizeof(dummy.last_task_alives[0]), (size_t)PTO2_MAX_RING_DEPTH); + + SUCCEED() << "PTO2_MAX_RING_DEPTH=" << PTO2_MAX_RING_DEPTH + << " is baked into fixed arrays in Scheduler, SharedMemory, and TensorMap. " + "Changing this constant requires recompiling ALL 4 components. " + "No runtime configurability exists."; +} + +// Including pto_scheduler.h transitively pulls in the entire runtime type hierarchy. +// Document the breadth of this coupling for a single component include. 
+TEST(CompileTimeIncludeCoupling, SchedulerHeaderTransitiveIncludes) { + // #include "pto_scheduler.h" causes: + // pto_scheduler.h -> pto_runtime2_types.h (task state, config constants) + // -> pto_shared_memory.h (SM handle, ring headers, flow control) + // -> pto_runtime2_types.h (again, guarded) + // -> pto_ring_buffer.h (TaskAllocator, FaninPool, DepPool, RingSet) + // -> pto_shared_memory.h (again, guarded) + // -> common/core_type.h (CoreType enum) + // Total headers transitively included: 6+ + + // Verify a few types from the transitive chain are available in this TU + // (these would be missing if the includes were broken) + PTO2TaskAllocator ta{}; // from pto_ring_buffer.h (consolidated TaskRing + HeapRing) + PTO2SharedMemoryHeader smh{}; // from pto_shared_memory.h + PTO2TaskState ts = PTO2_TASK_PENDING; // from pto_runtime2_types.h + (void)ta; + (void)smh; + (void)ts; + + SUCCEED() << "A single #include \"pto_scheduler.h\" makes available: " + "PTO2TaskAllocator, PTO2FaninPool, PTO2DepListPool, " + "PTO2SharedMemoryHandle, PTO2TaskSlotState, PTO2TaskState, " + "PTO2ReadyQueue, CoreType -- the entire runtime type set. " + "This creates a broad compile-time coupling surface."; +} + +// ============================================================================= +// Suite 5: ProfilingBehaviorCoupling +// ============================================================================= + +// The non-profiling release_fanin_and_check_ready (lines 426-448) does NOT +// perform CAS(PENDING->READY) before pushing to the ready queue. +// The profiling overload (lines 450-476) DOES perform the CAS. +// Document this divergence as a structural coupling of profiling to correctness. 
+TEST(ProfilingBehaviorCoupling, ProfilingAndNonProfiling_DifferentStateAfterRelease) { + PTO2SharedMemoryHandle *sm = pto2_sm_create(SW, SH); + ASSERT_NE(sm, nullptr); + uint8_t *heap = (uint8_t *)calloc(PTO2_MAX_RING_DEPTH, SH); + ASSERT_NE(heap, nullptr); + PTO2SchedulerState sched{}; + ASSERT_TRUE(pto2_scheduler_init(&sched, sm->header)); + + alignas(64) PTO2TaskSlotState slot{}; + slot.fanin_count = 1; + slot.fanin_refcount.store(0, std::memory_order_relaxed); + slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); + slot.active_mask = PTO2_SUBTASK_MASK_AIV0; + + sched.release_fanin_and_check_ready(slot, nullptr); + + PTO2TaskState state = slot.task_state.load(std::memory_order_acquire); + +#if PTO2_SCHED_PROFILING + // Profiling path: CAS was performed -> READY + EXPECT_EQ(state, PTO2_TASK_READY) << "Profiling build: CAS(PENDING->READY) executed before push. " + "Worker will see READY state when it pops this slot."; +#else + // Non-profiling path: no CAS -> still PENDING + EXPECT_EQ(state, PTO2_TASK_PENDING) << "Non-profiling build: slot pushed to ready queue with task_state=PENDING.\n" + "PTO2_SCHED_PROFILING flag changes CORRECTNESS, not just measurement.\n" + "See pto_scheduler.h lines 426-448 (non-profiling) vs 450-476 (profiling)."; +#endif + + pto2_scheduler_destroy(&sched); + free(heap); + pto2_sm_destroy(sm); +} + +// The profiling overload has an additional CAS guard that prevents double-push. +// The non-profiling overload relies on the caller ensuring exactly-once delivery. +// Document the API asymmetry as a coupling risk. 
+TEST(ProfilingBehaviorCoupling, ProfilingOverload_HasCASGuard_NonProfilingDoesNot) { + // Non-profiling signature (lines 426-448): + // bool release_fanin_and_check_ready(slot, local_bufs = nullptr) + // -> pushes unconditionally when fanin met; no CAS guard + // + // Profiling signature (lines 450-476): + // bool release_fanin_and_check_ready(slot, atomic_count, push_wait, local_bufs) + // -> CAS(PENDING->READY); only pushes if CAS succeeds + // -> if two threads race and both see new_refcount==fanin_count, + // only ONE will win the CAS; the other returns false (no double-push) + // + // Non-profiling has no such guard: if two threads both see new_refcount==fanin_count + // (which shouldn't happen due to fetch_add atomicity, but still an asymmetry), + // both would push. + + // Verify the non-profiling path returns true whenever fanin_count is met + PTO2SharedMemoryHandle *sm = pto2_sm_create(SW, SH); + ASSERT_NE(sm, nullptr); + uint8_t *heap = (uint8_t *)calloc(PTO2_MAX_RING_DEPTH, SH); + ASSERT_NE(heap, nullptr); + PTO2SchedulerState sched{}; + ASSERT_TRUE(pto2_scheduler_init(&sched, sm->header)); + + alignas(64) PTO2TaskSlotState slot{}; + slot.fanin_count = 2; + slot.fanin_refcount.store(0, std::memory_order_relaxed); + slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); + slot.active_mask = PTO2_SUBTASK_MASK_AIV0; + + bool r1 = sched.release_fanin_and_check_ready(slot, nullptr); // refcount->1, !=2 + bool r2 = sched.release_fanin_and_check_ready(slot, nullptr); // refcount->2, ==2 + + EXPECT_FALSE(r1) << "First release: refcount=1 != fanin_count=2 -> not ready"; + EXPECT_TRUE(r2) << "Second release: refcount=2 == fanin_count=2 -> ready, pushed"; + + SUCCEED() << "Non-profiling path: return true means 'pushed to queue'. " + "Profiling path: return true means 'CAS succeeded AND pushed'. 
" + "The distinction matters for exactly-once delivery guarantees " + "under concurrent access -- the non-profiling version trusts " + "fetch_add atomicity alone to prevent double-push."; + + pto2_scheduler_destroy(&sched); + free(heap); + pto2_sm_destroy(sm); +} + +// Profiling externs are declared inside #if blocks in hot-path headers. +// In non-profiling builds they are absent, but the conditional preprocessor blocks +// are part of the header's cognitive surface -- coupling profiling concern to the header. +TEST(ProfilingBehaviorCoupling, ProfilingExterns_InHotPathHeaders) { + // pto_scheduler.h declares (inside #if PTO2_SCHED_PROFILING): + // extern uint64_t g_sched_lock_cycle[]; + // extern uint64_t g_sched_fanout_cycle[]; + // ... (8+ extern arrays, used in on_mixed_task_complete) + // + // pto_ring_buffer.h declares (inside #if PTO2_ORCH_PROFILING): + // extern uint64_t g_orch_heap_wait_cycle; + // extern uint64_t g_orch_heap_atomic_count; + // ... (4+ extern scalars, used in heap_ring_try_alloc) + // + // These externs sit inside headers that are included in hot-path code. + // The profiling concern bleeds into the compile model of all translation units + // that include these headers. + +#if PTO2_SCHED_PROFILING + // In profiling build: the externs must be defined somewhere -- test stubs must provide them + SUCCEED() << "PTO2_SCHED_PROFILING=1: profiling externs are live in this build. " + "They are declared in pto_scheduler.h and used in on_mixed_task_complete."; +#else + // In non-profiling build: externs are absent -- but the #if blocks remain in the header + SUCCEED() << "PTO2_SCHED_PROFILING=0: profiling extern declarations are compiled out. " + "However, the #if PTO2_SCHED_PROFILING blocks in pto_scheduler.h " + "and pto_ring_buffer.h add conditional complexity to every reader " + "of these hot-path headers. 
Profiling coupling cannot be extracted " + "without modifying the headers themselves."; +#endif + + // Regardless of flag: the behavioral difference in release_fanin_and_check_ready + // means profiling and non-profiling builds have different task state semantics. + // This is the most significant coupling: a measurement flag alters correctness. + size_t slot_size = sizeof(PTO2TaskSlotState); + EXPECT_EQ(slot_size, 64u) << "PTO2TaskSlotState is 64 bytes (1 cache line). " + "Profiling adds atomic counters to PTO2SchedulerState (tasks_completed, " + "tasks_consumed) when PTO2_SCHED_PROFILING=1, potentially inflating the struct."; +} diff --git a/tests/ut/cpp/pto2_a2a3/test_dispatch_payload.cpp b/tests/ut/cpp/pto2_a2a3/test_dispatch_payload.cpp new file mode 100644 index 000000000..9911cddbd --- /dev/null +++ b/tests/ut/cpp/pto2_a2a3/test_dispatch_payload.cpp @@ -0,0 +1,150 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2DispatchPayload and SPMD context structures. + * + * Tests layout constants, alignment, static_assert consistency, and the + * get_block_idx / get_block_num / get_sub_block_id intrinsic accessors. 
+ */ + +#include <gtest/gtest.h> + +#include <cstdint> + +#include "intrinsic.h" +#include "pto2_dispatch_payload.h" +#include "pto_types.h" + +// ============================================================================= +// Compile-time constant consistency +// ============================================================================= + +TEST(DispatchPayloadConstants, LocalContextIndex) { + // SPMD_LOCAL_CONTEXT_INDEX must equal MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + EXPECT_EQ(SPMD_LOCAL_CONTEXT_INDEX, MAX_TENSOR_ARGS + MAX_SCALAR_ARGS); +} + +TEST(DispatchPayloadConstants, GlobalContextIndex) { + EXPECT_EQ(SPMD_GLOBAL_CONTEXT_INDEX, SPMD_LOCAL_CONTEXT_INDEX + 1); +} + +TEST(DispatchPayloadConstants, ExtParamsCount) { EXPECT_EQ(PTO2_EXT_PARAMS_COUNT, 2); } + +TEST(DispatchPayloadConstants, DispatchMaxArgs) { + EXPECT_EQ(PTO2_DISPATCH_MAX_ARGS, MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + PTO2_EXT_PARAMS_COUNT); +} + +// ============================================================================= +// PTO2DispatchPayload layout and alignment +// ============================================================================= + +// ABI contract: alignment must match device dispatch requirements. +TEST(DispatchPayloadLayout, IsAlignedTo64Bytes) { EXPECT_EQ(alignof(PTO2DispatchPayload), 64u); } + +TEST(DispatchPayloadLayout, ArgsArrayHasCorrectSize) { + PTO2DispatchPayload p{}; + EXPECT_EQ(sizeof(p.args) / sizeof(p.args[0]), static_cast<size_t>(PTO2_DISPATCH_MAX_ARGS)); +} + +// ABI contract: element size must match shared memory layout.
+TEST(DispatchPayloadLayout, ArgElementIs8Bytes) { + PTO2DispatchPayload p{}; + EXPECT_EQ(sizeof(p.args[0]), 8u); +} + +// ============================================================================= +// LocalContext +// ============================================================================= + +TEST(LocalContext, FieldsReadWrite) { + LocalContext lctx{3, 8}; + EXPECT_EQ(lctx.block_idx, 3); + EXPECT_EQ(lctx.block_num, 8); +} + +TEST(LocalContext, DefaultZero) { + LocalContext lctx{}; + EXPECT_EQ(lctx.block_idx, 0); + EXPECT_EQ(lctx.block_num, 0); +} + +// ============================================================================= +// GlobalContext +// ============================================================================= + +TEST(GlobalContext, FieldReadWrite) { + GlobalContext gctx{1}; + EXPECT_EQ(gctx.sub_block_id, 1); +} + +// ============================================================================= +// Intrinsic accessor functions +// ============================================================================= + +// Build a minimal args[] array with context pointers at the correct indices. 
+struct IntrinsicTestSetup { + static constexpr int kArgsLen = SPMD_GLOBAL_CONTEXT_INDEX + 1; + LocalContext lctx; + GlobalContext gctx; + uint64_t args[kArgsLen]; + + IntrinsicTestSetup(int block_idx, int block_num, int sub_block_id) : + lctx{block_idx, block_num}, + gctx{sub_block_id} { + for (auto &a : args) + a = 0; + args[SPMD_LOCAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&lctx); + args[SPMD_GLOBAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&gctx); + } + + int64_t *raw() { return reinterpret_cast<int64_t *>(args); } +}; + +TEST(IntrinsicAccessors, GetBlockIdx) { + IntrinsicTestSetup s(5, 10, 0); + EXPECT_EQ(get_block_idx(s.raw()), 5); +} + +TEST(IntrinsicAccessors, GetBlockNum) { + IntrinsicTestSetup s(0, 7, 0); + EXPECT_EQ(get_block_num(s.raw()), 7); +} + +TEST(IntrinsicAccessors, GetSubBlockId_AIV0) { + IntrinsicTestSetup s(0, 1, 0); + EXPECT_EQ(get_sub_block_id(s.raw()), 0); +} + +TEST(IntrinsicAccessors, GetSubBlockId_AIV1) { + IntrinsicTestSetup s(0, 1, 1); + EXPECT_EQ(get_sub_block_id(s.raw()), 1); +} + +TEST(IntrinsicAccessors, BlockIdxAndNumIndependent) { + // Changing block_idx must not affect block_num and vice versa + IntrinsicTestSetup s(2, 4, 0); + EXPECT_EQ(get_block_idx(s.raw()), 2); + EXPECT_EQ(get_block_num(s.raw()), 4); + + s.lctx.block_idx = 3; + EXPECT_EQ(get_block_idx(s.raw()), 3); + EXPECT_EQ(get_block_num(s.raw()), 4); +} + +TEST(IntrinsicAccessors, ContextPointersAreAtCorrectSlots) { + IntrinsicTestSetup s(1, 2, 0); + // The value at SPMD_LOCAL_CONTEXT_INDEX must point to lctx + auto lctx_ptr = reinterpret_cast<LocalContext *>(static_cast<uintptr_t>(s.args[SPMD_LOCAL_CONTEXT_INDEX])); + EXPECT_EQ(lctx_ptr, &s.lctx); + + auto gctx_ptr = reinterpret_cast<GlobalContext *>(static_cast<uintptr_t>(s.args[SPMD_GLOBAL_CONTEXT_INDEX])); + EXPECT_EQ(gctx_ptr, &s.gctx); +} diff --git a/tests/ut/cpp/pto2_a2a3/test_handshake.cpp new file mode 100644 index 000000000..859901180 --- /dev/null +++ b/tests/ut/cpp/pto2_a2a3/test_handshake.cpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) PyPTO
Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for Handshake Protocol macros. + * + * Tests the ACK/FIN dual-state register encoding/decoding defined in + * platform_config.h: MAKE_ACK_VALUE, MAKE_FIN_VALUE, EXTRACT_TASK_ID, + * EXTRACT_TASK_STATE, and reserved ID guards. + */ + +#include <gtest/gtest.h> +#include "common/platform_config.h" + +// ============================================================================= +// ACK value encoding (bit 31 = 0) +// ============================================================================= + +TEST(HandshakeProtocol, MakeAckValue_Bit31Clear) { + uint64_t ack = MAKE_ACK_VALUE(42); + // bit 31 must be 0 for ACK + EXPECT_EQ(ack & TASK_STATE_MASK, 0u); + EXPECT_EQ(EXTRACT_TASK_STATE(ack), TASK_ACK_STATE); +} + +TEST(HandshakeProtocol, MakeAckValue_PreservesTaskId) { + for (int task_id : {0, 1, 100, 1000000, 0x7FFFFFFF}) { + uint64_t ack = MAKE_ACK_VALUE(task_id); + EXPECT_EQ(EXTRACT_TASK_ID(ack), task_id); + } +} + +// ============================================================================= +// FIN value encoding (bit 31 = 1) +// ============================================================================= + +TEST(HandshakeProtocol, MakeFinValue_Bit31Set) { + uint64_t fin = MAKE_FIN_VALUE(42); + // bit 31 must be 1 for FIN + EXPECT_NE(fin &
TASK_STATE_MASK, 0u); + EXPECT_EQ(EXTRACT_TASK_STATE(fin), TASK_FIN_STATE); +} + +TEST(HandshakeProtocol, MakeFinValue_PreservesTaskId) { + for (int task_id : {0, 1, 100, 1000000, 0x7FFFFFFF}) { + uint64_t fin = MAKE_FIN_VALUE(task_id); + EXPECT_EQ(EXTRACT_TASK_ID(fin), task_id); + } +} + +// ============================================================================= +// Roundtrip: encode -> decode +// ============================================================================= + +TEST(HandshakeProtocol, AckRoundtrip) { + for (int id = 0; id < 1000; id++) { + uint64_t ack = MAKE_ACK_VALUE(id); + EXPECT_EQ(EXTRACT_TASK_ID(ack), id); + EXPECT_EQ(EXTRACT_TASK_STATE(ack), TASK_ACK_STATE); + } +} + +TEST(HandshakeProtocol, FinRoundtrip) { + for (int id = 0; id < 1000; id++) { + uint64_t fin = MAKE_FIN_VALUE(id); + EXPECT_EQ(EXTRACT_TASK_ID(fin), id); + EXPECT_EQ(EXTRACT_TASK_STATE(fin), TASK_FIN_STATE); + } +} + +// ============================================================================= +// Reserved task IDs +// ============================================================================= + +TEST(HandshakeProtocol, ReservedIdGuard_IdleAndExit) { + // IDLE and EXIT task IDs must be distinct + EXPECT_NE(AICORE_IDLE_TASK_ID, AICORE_EXIT_TASK_ID); + + // Both must be in the reserved range (high values) + EXPECT_GT(AICORE_IDLE_TASK_ID, 0x7FFFFFF0u); + EXPECT_GT(AICORE_EXIT_TASK_ID, 0x7FFFFFF0u); +} + +TEST(HandshakeProtocol, ReservedIdGuard_IdleValue) { + // AICORE_IDLE_VALUE should encode IDLE_TASK_ID with FIN state + uint64_t idle = AICORE_IDLE_VALUE; + EXPECT_EQ(EXTRACT_TASK_STATE(idle), TASK_FIN_STATE); + EXPECT_EQ(EXTRACT_TASK_ID(idle), (int)AICORE_IDLE_TASK_ID); +} + +TEST(HandshakeProtocol, ReservedIdGuard_ExitValue) { + // AICORE_EXITED_VALUE should encode EXIT_TASK_ID with FIN state + uint64_t exited = AICORE_EXITED_VALUE; + EXPECT_EQ(EXTRACT_TASK_STATE(exited), TASK_FIN_STATE); + EXPECT_EQ(EXTRACT_TASK_ID(exited), (int)AICORE_EXIT_TASK_ID); +} + +// 
============================================================================= +// Exit signal +// ============================================================================= + +TEST(HandshakeProtocol, ExitSignalValue) { + // AICORE_EXIT_SIGNAL is a special dispatch value + EXPECT_EQ(AICORE_EXIT_SIGNAL, 0x7FFFFFF0u); +} + +// ============================================================================= +// Invalid task ID sentinel +// ============================================================================= + +TEST(HandshakeProtocol, InvalidTaskSentinel) { EXPECT_EQ(AICPU_TASK_INVALID, -1); } diff --git a/tests/ut/cpp/pto2_a2a3/test_orchestrator_fatal.cpp b/tests/ut/cpp/pto2_a2a3/test_orchestrator_fatal.cpp new file mode 100644 index 000000000..7eb012216 --- /dev/null +++ b/tests/ut/cpp/pto2_a2a3/test_orchestrator_fatal.cpp @@ -0,0 +1,187 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * UT for the orchestrator-side fatal reporting path. 
+ * + * Targets pto2_orch_report_fatal (pto_orchestrator.cpp) and verifies: + * - orch->fatal latches to true on any non-zero error code + * - the first non-zero code wins via CAS into sm_header->orch_error_code + * - subsequent fatal reports do NOT overwrite the first code + * - PTO2_ERROR_NONE never latches the shared-memory code (but still flips + * the local fatal flag -- by design, callers may use it to mark fatal + * without writing a code) + * - resilience when sm_handle / header is null (no crash, local flag flips) + * + * This test exercises the real symbol against a fully-initialized + * orchestrator + shared memory pair, complementing the fake-runtime test + * (test_a2a3_pto2_fatal.cpp) that only validates the ops-table dispatch. + */ + +#include <gtest/gtest.h> + +#include <cstdint> +#include <cstdlib> + +#include "pto_orchestrator.h" +#include "pto_runtime_status.h" +#include "pto_scheduler.h" +#include "pto_shared_memory.h" + +namespace { + +constexpr uint64_t kHeapSize = 64 * 1024; +constexpr int32_t kWindowSize = 64; +constexpr int32_t kDepPool = 256; + +class OrchestratorFatalTest : public ::testing::Test { +protected: + PTO2SharedMemoryHandle *sm_ = nullptr; + PTO2SchedulerState sched_{}; + PTO2OrchestratorState orch_{}; + uint8_t *gm_heap_ = nullptr; + bool sched_ok_ = false; + bool orch_ok_ = false; + + void SetUp() override { + sm_ = pto2_sm_create(kWindowSize, kHeapSize); + ASSERT_NE(sm_, nullptr); + + gm_heap_ = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, kHeapSize)); + ASSERT_NE(gm_heap_, nullptr); + + sched_ok_ = pto2_scheduler_init(&sched_, sm_->header, kDepPool); + ASSERT_TRUE(sched_ok_); + + orch_ok_ = pto2_orchestrator_init(&orch_, sm_->header, gm_heap_, kHeapSize, kDepPool); + ASSERT_TRUE(orch_ok_); + } + + void TearDown() override { + if (orch_ok_) pto2_orchestrator_destroy(&orch_); + if (sched_ok_) pto2_scheduler_destroy(&sched_); + if (gm_heap_) std::free(gm_heap_); + if (sm_) pto2_sm_destroy(sm_); + } + + int32_t shared_orch_code() const { return
sm_->header->orch_error_code.load(std::memory_order_acquire); } +}; + +} // namespace + +// ---------- baseline ---------- + +TEST_F(OrchestratorFatalTest, InitialState_NoFatalNoSharedCode) { + // Verify no fatal state via the observable shared memory output + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_NONE); +} + +// ---------- happy path: single fatal latches both local flag and shared code ---------- + +TEST_F(OrchestratorFatalTest, ReportFatal_SetsLocalFlagAndSharedCode) { + pto2_orch_report_fatal(&orch_, PTO2_ERROR_HEAP_RING_DEADLOCK, "test", "deadlock at ring %d", 3); + + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_HEAP_RING_DEADLOCK); +} + +// ---------- CAS first-writer-wins ---------- + +TEST_F(OrchestratorFatalTest, SecondReportFatal_DoesNotOverwriteSharedCode) { + pto2_orch_report_fatal(&orch_, PTO2_ERROR_HEAP_RING_DEADLOCK, "test", nullptr); + pto2_orch_report_fatal(&orch_, PTO2_ERROR_DEP_POOL_OVERFLOW, "test", nullptr); + + // Second report must NOT overwrite the first latched code. + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_HEAP_RING_DEADLOCK); +} + +TEST_F(OrchestratorFatalTest, RepeatedSameCode_StaysLatched) { + pto2_orch_report_fatal(&orch_, PTO2_ERROR_INVALID_ARGS, "test", nullptr); + pto2_orch_report_fatal(&orch_, PTO2_ERROR_INVALID_ARGS, "test", nullptr); + + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_INVALID_ARGS); +} + +// ---------- PTO2_ERROR_NONE: marks fatal locally, does NOT touch shared code ---------- + +TEST_F(OrchestratorFatalTest, ReportFatalWithErrorNone_DoesNotWriteSharedCode) { + pto2_orch_report_fatal(&orch_, PTO2_ERROR_NONE, "test", nullptr); + + // Local fatal flag flips (tested via another report not latching a different code), + // but no code is written to shared memory. 
+ EXPECT_EQ(shared_orch_code(), PTO2_ERROR_NONE); +} + +// ---------- PTO2_ERROR_NONE first does not block a real code from latching ---------- + +TEST_F(OrchestratorFatalTest, ErrorNoneFirst_RealCodeStillLatchesAfter) { + pto2_orch_report_fatal(&orch_, PTO2_ERROR_NONE, "test", nullptr); + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_NONE); + + pto2_orch_report_fatal(&orch_, PTO2_ERROR_SCOPE_DEADLOCK, "test", nullptr); + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_SCOPE_DEADLOCK); +} + +// ---------- coverage of every defined orchestrator code ---------- + +TEST_F(OrchestratorFatalTest, EveryOrchCode_LatchesIntoSharedMemory) { + const int32_t codes[] = { + PTO2_ERROR_SCOPE_DEADLOCK, + PTO2_ERROR_HEAP_RING_DEADLOCK, + PTO2_ERROR_FLOW_CONTROL_DEADLOCK, + PTO2_ERROR_DEP_POOL_OVERFLOW, + PTO2_ERROR_INVALID_ARGS, + PTO2_ERROR_DEPENDENCY_OVERFLOW, + PTO2_ERROR_REQUIRE_SYNC_START_INVALID, + PTO2_ERROR_TENSOR_WAIT_TIMEOUT, + PTO2_ERROR_EXPLICIT_ORCH_FATAL, + }; + for (int32_t code : codes) { + // Reset latches between iterations. Direct field access is unavoidable here + // since there is no public reset API for the orchestrator fatal state. 
+ sm_->header->orch_error_code.store(PTO2_ERROR_NONE, std::memory_order_release); + orch_.fatal = false; + + pto2_orch_report_fatal(&orch_, code, "test", "code=%d", code); + + SCOPED_TRACE(testing::Message() << "code=" << code); + EXPECT_EQ(shared_orch_code(), code); + } +} + +// ---------- format-string variants must not crash ---------- + +TEST_F(OrchestratorFatalTest, NullFmt_DoesNotCrash) { + pto2_orch_report_fatal(&orch_, PTO2_ERROR_INVALID_ARGS, "func", nullptr); + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_INVALID_ARGS); +} + +TEST_F(OrchestratorFatalTest, EmptyFmt_DoesNotCrash) { + pto2_orch_report_fatal(&orch_, PTO2_ERROR_INVALID_ARGS, "func", ""); + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_INVALID_ARGS); +} + +TEST_F(OrchestratorFatalTest, FmtWithVarArgs_DoesNotCrash) { + pto2_orch_report_fatal( + &orch_, PTO2_ERROR_TENSOR_WAIT_TIMEOUT, "func", "tensor=%p slot=%d msg=%s", + reinterpret_cast<void *>(0xdeadbeef), 17, "boom" + ); + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_TENSOR_WAIT_TIMEOUT); +} + +// ---------- end-to-end: status helper sees latched code ---------- + +TEST_F(OrchestratorFatalTest, StatusHelperReadsLatchedOrchCode) { + pto2_orch_report_fatal(&orch_, PTO2_ERROR_FLOW_CONTROL_DEADLOCK, "func", nullptr); + + int32_t orch_code = shared_orch_code(); + int32_t sched_code = sm_->header->sched_error_code.load(std::memory_order_acquire); + EXPECT_EQ(pto2_runtime_status_from_error_codes(orch_code, sched_code), -PTO2_ERROR_FLOW_CONTROL_DEADLOCK); +} diff --git a/tests/ut/cpp/pto2_a2a3/test_orchestrator_submit.cpp new file mode 100644 index 000000000..a5afb3b3b --- /dev/null +++ b/tests/ut/cpp/pto2_a2a3/test_orchestrator_submit.cpp @@ -0,0 +1,273 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * Orchestrator submit-path UT. + * + * Covers pto2_submit_mixed_task, pto2_alloc_tensors, pto2_orchestrator_done, + * and pto2_orchestrator_set_scheduler on a fully initialized + * TMR system. + * + * Follows AAA and FIRST: each TEST_F builds a fresh TMRSystem, exercises + * one behavior, and tears the system down in TearDown(). + */ + +#include <gtest/gtest.h> + +#include <atomic> +#include <cstdint> +#include <cstdlib> + +#include "pto_orchestration_api.h" // make_tensor_external, TensorCreateInfo ctor +#include "pto_orchestrator.h" +#include "pto_ring_buffer.h" +#include "pto_scheduler.h" +#include "pto_shared_memory.h" +#include "pto_submit_types.h" +#include "pto_tensormap.h" +#include "tensor.h" + +namespace { + +constexpr uint64_t kHeapSize = 64 * 1024; +constexpr int32_t kWindowSize = 64; +constexpr int32_t kDepPool = 256; + +// ----------------------------------------------------------------------------- +// Fixture: minimal TMR system for orchestrator-level tests.
+// ----------------------------------------------------------------------------- +class OrchestratorSubmitTest : public ::testing::Test { +protected: + PTO2SharedMemoryHandle *sm_ = nullptr; + PTO2SchedulerState sched_{}; + PTO2OrchestratorState orch_{}; + uint8_t *gm_heap_ = nullptr; + bool sched_ok_ = false; + bool orch_ok_ = false; + + void SetUp() override { + sm_ = pto2_sm_create(kWindowSize, kHeapSize); + ASSERT_NE(sm_, nullptr); + + gm_heap_ = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, kHeapSize)); + ASSERT_NE(gm_heap_, nullptr); + + sched_ok_ = pto2_scheduler_init(&sched_, sm_->header, kDepPool); + ASSERT_TRUE(sched_ok_); + + orch_ok_ = pto2_orchestrator_init(&orch_, sm_->header, gm_heap_, kHeapSize, kDepPool); + ASSERT_TRUE(orch_ok_); + + pto2_orchestrator_set_scheduler(&orch_, &sched_); + } + + void TearDown() override { + if (orch_ok_) pto2_orchestrator_destroy(&orch_); + if (sched_ok_) pto2_scheduler_destroy(&sched_); + if (gm_heap_) std::free(gm_heap_); + if (sm_) pto2_sm_destroy(sm_); + } + + // Helper: build a minimal TensorCreateInfo owning one FP32 scalar output. + static TensorCreateInfo make_scalar_ci() { + static const uint32_t kShape[1] = {1}; + return TensorCreateInfo(kShape, 1, DataType::FLOAT32); + } + + bool has_orch_error() const { + return sm_->header->orch_error_code.load(std::memory_order_acquire) != PTO2_ERROR_NONE; + } +}; + +} // namespace + +// ---------- set_scheduler ---------- + +TEST_F(OrchestratorSubmitTest, SetScheduler_StoresPointer) { + PTO2SchedulerState other{}; + pto2_orchestrator_set_scheduler(&orch_, &other); + // Direct field read: no public getter exists for the scheduler pointer. + EXPECT_EQ(orch_.scheduler, &other); + + // Restore for TearDown.
+ pto2_orchestrator_set_scheduler(&orch_, &sched_); +} + +// ---------- alloc_tensors: argument validation ---------- + +TEST_F(OrchestratorSubmitTest, AllocTensors_EmptyArgs_MarksFatal) { + Arg args; // no tensors, no scalars + + TaskOutputTensors result = pto2_alloc_tensors(&orch_, args); + + EXPECT_TRUE(result.empty()); + EXPECT_TRUE(has_orch_error()); +} + +TEST_F(OrchestratorSubmitTest, AllocTensors_WithScalars_MarksFatal) { + TensorCreateInfo ci = make_scalar_ci(); + Arg args; + args.add_output(ci); + args.add_scalar(uint64_t{42}); + + TaskOutputTensors result = pto2_alloc_tensors(&orch_, args); + + EXPECT_TRUE(result.empty()); + EXPECT_TRUE(has_orch_error()); +} + +TEST_F(OrchestratorSubmitTest, AllocTensors_InputArg_MarksFatal) { + // alloc_tensors only accepts OUTPUT TensorCreateInfo args. + uint32_t shape[1] = {1}; + Tensor input = make_tensor_external(reinterpret_cast<void *>(0x1000), shape, 1); + Arg args; + args.add_input(input); + + TaskOutputTensors result = pto2_alloc_tensors(&orch_, args); + + EXPECT_TRUE(result.empty()); + EXPECT_TRUE(has_orch_error()); +} + +TEST_F(OrchestratorSubmitTest, AllocTensors_OutputOnly_ReturnsMaterializedTensors) { + // Arrange: two output CIs, inside an active scope. + TensorCreateInfo ci1 = make_scalar_ci(); + TensorCreateInfo ci2 = make_scalar_ci(); + Arg args; + args.add_output(ci1, ci2); + + // Act + pto2_scope_begin(&orch_); + TaskOutputTensors result = pto2_alloc_tensors(&orch_, args); + pto2_scope_end(&orch_); + + // Assert + EXPECT_FALSE(has_orch_error()); + EXPECT_EQ(result.size(), 2U); +} + +TEST_F(OrchestratorSubmitTest, AllocTensors_AfterFatal_ReturnsEmpty) { + // Arrange: force fatal.
+ pto2_orch_report_fatal(&orch_, PTO2_ERROR_EXPLICIT_ORCH_FATAL, "UT", nullptr); + ASSERT_TRUE(has_orch_error()); + + TensorCreateInfo ci = make_scalar_ci(); + Arg args; + args.add_output(ci); + + // Act + TaskOutputTensors result = pto2_alloc_tensors(&orch_, args); + + // Assert + EXPECT_TRUE(result.empty()); +} + +// ---------- submit_mixed_task ---------- + +TEST_F(OrchestratorSubmitTest, SubmitMixedTask_AfterFatal_ReturnsEmpty) { + // Arrange: pre-fatal state + pto2_orch_report_fatal(&orch_, PTO2_ERROR_EXPLICIT_ORCH_FATAL, "UT", nullptr); + + MixedKernels mixed; + mixed.aic_kernel_id = 0; + Arg args; + + // Act + TaskOutputTensors result = pto2_submit_mixed_task(&orch_, mixed, args); + + // Assert + EXPECT_TRUE(result.empty()); +} + +TEST_F(OrchestratorSubmitTest, SubmitMixedTask_ArgWithError_MarksFatalInvalidArgs) { + // Arrange: craft an Arg with has_error set. + // Calling add_input after add_scalar triggers the ordering error path. + uint32_t shape[1] = {1}; + Tensor t = make_tensor_external(reinterpret_cast<void *>(0x1000), shape, 1); + Arg args; + args.add_scalar(uint64_t{1}); + args.add_input(t); // illegal ordering -> has_error = true + ASSERT_TRUE(args.has_error); + + MixedKernels mixed; + mixed.aic_kernel_id = 0; + + // Act + pto2_scope_begin(&orch_); + TaskOutputTensors result = pto2_submit_mixed_task(&orch_, mixed, args); + pto2_scope_end(&orch_); + + // Assert + EXPECT_TRUE(result.empty()); + EXPECT_TRUE(has_orch_error()); +} + +TEST_F(OrchestratorSubmitTest, SubmitMixedTask_PureInputOnly_Succeeds) { + // Arrange: one input tensor, one AIC kernel, within a scope.
+ uint32_t shape[1] = {1}; + Tensor input = make_tensor_external(reinterpret_cast<void *>(0x2000), shape, 1); + + Arg args; + args.add_input(input); + ASSERT_FALSE(args.has_error); + + MixedKernels mixed; + mixed.aic_kernel_id = 7; // any non-invalid id + + // Act + pto2_scope_begin(&orch_); + TaskOutputTensors result = pto2_submit_mixed_task(&orch_, mixed, args); + pto2_scope_end(&orch_); + + // Assert: submit returns (no outputs), and no fatal state was set. + EXPECT_TRUE(result.empty()); + EXPECT_FALSE(has_orch_error()); +} + +TEST_F(OrchestratorSubmitTest, SubmitMixedTask_OutputTensor_MaterializesResult) { + // Arrange: one OUTPUT TensorCreateInfo -> task produces one tensor. + TensorCreateInfo ci = make_scalar_ci(); + Arg args; + args.add_output(ci); + + MixedKernels mixed; + mixed.aic_kernel_id = 1; + + // Act + pto2_scope_begin(&orch_); + TaskOutputTensors result = pto2_submit_mixed_task(&orch_, mixed, args); + pto2_scope_end(&orch_); + + // Assert + EXPECT_FALSE(has_orch_error()); + EXPECT_EQ(result.size(), 1U); +} + +// ---------- orchestrator_done ---------- + +TEST_F(OrchestratorSubmitTest, OrchestratorDone_SetsSharedMemoryFlag) { + // Arrange + ASSERT_EQ(sm_->header->orchestrator_done.load(), 0); + + // Act + pto2_orchestrator_done(&orch_); + + // Assert + EXPECT_EQ(sm_->header->orchestrator_done.load(std::memory_order_acquire), 1); +} + +TEST_F(OrchestratorSubmitTest, OrchestratorDone_IsIdempotent) { + pto2_orchestrator_done(&orch_); + pto2_orchestrator_done(&orch_); + + // Flag stays 1 -- store is release-set, not increment. + EXPECT_EQ(sm_->header->orchestrator_done.load(std::memory_order_acquire), 1); +} diff --git a/tests/ut/cpp/pto2_a2a3/test_ready_queue.cpp new file mode 100644 index 000000000..28de1f8cd --- /dev/null +++ b/tests/ut/cpp/pto2_a2a3/test_ready_queue.cpp @@ -0,0 +1,409 @@ +/* + * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Unit tests for PTO2ReadyQueue and PTO2LocalReadyBuffer from pto_scheduler.h
+ *
+ * Tests the lock-free bounded MPMC queue (Vyukov design) and the thread-local
+ * ready buffer used for local-first dispatch optimization.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <functional>
+#include <thread>
+#include <vector>
+
+#include "pto_scheduler.h"
+
+// =============================================================================
+// ReadyQueue: Single-threaded tests
+// =============================================================================
+
+class ReadyQueueTest : public ::testing::Test {
+protected:
+    static constexpr uint64_t kCapacity = 16;  // Power of 2
+
+    PTO2ReadyQueue queue;
+
+    void SetUp() override { ASSERT_TRUE(pto2_ready_queue_init(&queue, kCapacity)); }
+
+    void TearDown() override { pto2_ready_queue_destroy(&queue); }
+};
+
+// 1. Empty pop returns nullptr
+TEST_F(ReadyQueueTest, EmptyPopReturnsNullptr) { EXPECT_EQ(queue.pop(), nullptr); }
+
+// 2. Single push/pop returns correct item
+TEST_F(ReadyQueueTest, SinglePushPop) {
+    PTO2TaskSlotState item;
+    ASSERT_TRUE(queue.push(&item));
+
+    PTO2TaskSlotState *result = queue.pop();
+    EXPECT_EQ(result, &item);
+}
+
+// 3.
FIFO ordering: push A,B,C then pop A,B,C
+TEST_F(ReadyQueueTest, FIFOOrdering) {
+    PTO2TaskSlotState a, b, c;
+
+    ASSERT_TRUE(queue.push(&a));
+    ASSERT_TRUE(queue.push(&b));
+    ASSERT_TRUE(queue.push(&c));
+
+    EXPECT_EQ(queue.pop(), &a);
+    EXPECT_EQ(queue.pop(), &b);
+    EXPECT_EQ(queue.pop(), &c);
+    EXPECT_EQ(queue.pop(), nullptr);
+}
+
+// 4. Queue full: push returns false at capacity
+TEST_F(ReadyQueueTest, QueueFullReturnsFalse) {
+    std::vector<PTO2TaskSlotState> items(kCapacity);
+
+    for (uint64_t i = 0; i < kCapacity; i++) {
+        ASSERT_TRUE(queue.push(&items[i]));
+    }
+
+    PTO2TaskSlotState extra;
+    EXPECT_FALSE(queue.push(&extra));
+}
+
+// 5. Slot reuse after full drain (push/pop cycle)
+TEST_F(ReadyQueueTest, SlotReuseAfterFullDrain) {
+    std::vector<PTO2TaskSlotState> items(kCapacity);
+
+    // Fill the queue
+    for (uint64_t i = 0; i < kCapacity; i++) {
+        ASSERT_TRUE(queue.push(&items[i]));
+    }
+
+    // Drain the queue
+    for (uint64_t i = 0; i < kCapacity; i++) {
+        EXPECT_EQ(queue.pop(), &items[i]);
+    }
+    EXPECT_EQ(queue.pop(), nullptr);
+
+    // Refill and re-drain to verify slot reuse
+    for (uint64_t i = 0; i < kCapacity; i++) {
+        ASSERT_TRUE(queue.push(&items[i]));
+    }
+    for (uint64_t i = 0; i < kCapacity; i++) {
+        EXPECT_EQ(queue.pop(), &items[i]);
+    }
+    EXPECT_EQ(queue.pop(), nullptr);
+}
+
+// 6. push_batch: batch enqueue then individual dequeue
+TEST_F(ReadyQueueTest, PushBatchThenIndividualPop) {
+    constexpr int kBatchSize = 5;
+    PTO2TaskSlotState items[kBatchSize];
+    PTO2TaskSlotState *ptrs[kBatchSize];
+    for (int i = 0; i < kBatchSize; i++) {
+        ptrs[i] = &items[i];
+    }
+
+    queue.push_batch(ptrs, kBatchSize);
+
+    for (int i = 0; i < kBatchSize; i++) {
+        EXPECT_EQ(queue.pop(), &items[i]);
+    }
+    EXPECT_EQ(queue.pop(), nullptr);
+}
+
+// 7. push_batch count=0: no-op
+TEST_F(ReadyQueueTest, PushBatchZeroIsNoop) {
+    queue.push_batch(nullptr, 0);
+
+    EXPECT_EQ(queue.size(), 0u);
+    EXPECT_EQ(queue.pop(), nullptr);
+}
+
+// 8.
pop_batch: push 10, pop_batch(5) returns 5 +TEST_F(ReadyQueueTest, PopBatchReturnsFive) { + constexpr int kPushCount = 10; + PTO2TaskSlotState items[kPushCount]; + + for (int i = 0; i < kPushCount; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + + PTO2TaskSlotState *out[5]; + int popped = queue.pop_batch(out, 5); + EXPECT_EQ(popped, 5); + + for (int i = 0; i < 5; i++) { + EXPECT_EQ(out[i], &items[i]); + } +} + +// 9. pop_batch partial: push 3, pop_batch(5) returns 3 +TEST_F(ReadyQueueTest, PopBatchPartial) { + constexpr int kPushCount = 3; + PTO2TaskSlotState items[kPushCount]; + + for (int i = 0; i < kPushCount; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + + PTO2TaskSlotState *out[5]; + int popped = queue.pop_batch(out, 5); + EXPECT_EQ(popped, kPushCount); + + for (int i = 0; i < kPushCount; i++) { + EXPECT_EQ(out[i], &items[i]); + } +} + +// 10. pop_batch empty: returns 0 +TEST_F(ReadyQueueTest, PopBatchEmpty) { + PTO2TaskSlotState *out[5]; + int popped = queue.pop_batch(out, 5); + EXPECT_EQ(popped, 0); +} + +// 11. 
size() accuracy after various push/pop +TEST_F(ReadyQueueTest, SizeAccuracy) { + EXPECT_EQ(queue.size(), 0u); + + PTO2TaskSlotState items[8]; + + queue.push(&items[0]); + EXPECT_EQ(queue.size(), 1u); + + queue.push(&items[1]); + queue.push(&items[2]); + EXPECT_EQ(queue.size(), 3u); + + queue.pop(); + EXPECT_EQ(queue.size(), 2u); + + queue.pop(); + queue.pop(); + EXPECT_EQ(queue.size(), 0u); + + // Push 5 more + for (int i = 0; i < 5; i++) { + queue.push(&items[i]); + } + EXPECT_EQ(queue.size(), 5u); +} + +// ============================================================================= +// ReadyQueue: Multi-threaded tests +// ============================================================================= + +class ReadyQueueMTTest : public ::testing::Test { +protected: + static constexpr uint64_t kCapacity = 1024; // Power of 2 + + PTO2ReadyQueue queue; + + void SetUp() override { ASSERT_TRUE(pto2_ready_queue_init(&queue, kCapacity)); } + + void TearDown() override { pto2_ready_queue_destroy(&queue); } +}; + +// 12. 
2 producers / 2 consumers: all items consumed exactly once
+TEST_F(ReadyQueueMTTest, TwoProducersTwoConsumers) {
+    constexpr int kItemsPerProducer = 200;
+    constexpr int kTotalItems = kItemsPerProducer * 2;
+
+    std::vector<PTO2TaskSlotState> items(kTotalItems);
+
+    std::atomic<int> produced{0};
+    std::atomic<bool> producers_done{false};
+
+    // Tracking: atomic counter per item to verify exactly-once consumption
+    // Use pointer identity (index into items array) instead of struct field tagging
+    std::vector<std::atomic<int>> consumed_count(kTotalItems);
+    for (int i = 0; i < kTotalItems; i++) {
+        consumed_count[i].store(0, std::memory_order_relaxed);
+    }
+
+    auto item_index = [&](PTO2TaskSlotState *s) -> int {
+        return static_cast<int>(s - items.data());
+    };
+
+    auto producer = [&](int offset) {
+        for (int i = 0; i < kItemsPerProducer; i++) {
+            while (!queue.push(&items[offset + i])) {
+                // Queue full, retry
+            }
+        }
+        produced.fetch_add(kItemsPerProducer, std::memory_order_release);
+    };
+
+    auto consumer = [&](std::vector<PTO2TaskSlotState *> &results) {
+        while (true) {
+            PTO2TaskSlotState *item = queue.pop();
+            if (item != nullptr) {
+                results.push_back(item);
+                consumed_count[item_index(item)].fetch_add(1, std::memory_order_relaxed);
+            } else if (producers_done.load(std::memory_order_acquire)) {
+                // Drain remaining
+                while ((item = queue.pop()) != nullptr) {
+                    results.push_back(item);
+                    consumed_count[item_index(item)].fetch_add(1, std::memory_order_relaxed);
+                }
+                break;
+            }
+        }
+    };
+
+    std::vector<PTO2TaskSlotState *> results_c1, results_c2;
+    std::thread p1(producer, 0);
+    std::thread p2(producer, kItemsPerProducer);
+    std::thread c1(consumer, std::ref(results_c1));
+    std::thread c2(consumer, std::ref(results_c2));
+
+    p1.join();
+    p2.join();
+    producers_done.store(true, std::memory_order_release);
+    c1.join();
+    c2.join();
+
+    // Verify all items consumed exactly once
+    int total_consumed = static_cast<int>(results_c1.size() + results_c2.size());
+    EXPECT_EQ(total_consumed, kTotalItems);
+
+    for (int i = 0; i < kTotalItems; i++) {
EXPECT_EQ(consumed_count[i].load(), 1)
+            << "Item " << i << " consumed " << consumed_count[i].load() << " times (expected 1)";
+    }
+}
+
+// 13. 1 producer / N consumers: all items consumed exactly once
+TEST_F(ReadyQueueMTTest, OneProducerNConsumers) {
+    constexpr int kTotalItems = 500;
+    constexpr int kNumConsumers = 4;
+
+    std::vector<PTO2TaskSlotState> items(kTotalItems);
+
+    std::atomic<bool> producer_done{false};
+    std::vector<std::atomic<int>> consumed_count(kTotalItems);
+    for (int i = 0; i < kTotalItems; i++) {
+        consumed_count[i].store(0, std::memory_order_relaxed);
+    }
+
+    auto item_index = [&](PTO2TaskSlotState *s) -> int {
+        return static_cast<int>(s - items.data());
+    };
+
+    auto producer = [&]() {
+        for (int i = 0; i < kTotalItems; i++) {
+            while (!queue.push(&items[i])) {
+                // Queue full, retry
+            }
+        }
+        producer_done.store(true, std::memory_order_release);
+    };
+
+    std::atomic<int> total_consumed{0};
+
+    auto consumer = [&]() {
+        while (true) {
+            PTO2TaskSlotState *item = queue.pop();
+            if (item != nullptr) {
+                consumed_count[item_index(item)].fetch_add(1, std::memory_order_relaxed);
+                total_consumed.fetch_add(1, std::memory_order_relaxed);
+            } else if (producer_done.load(std::memory_order_acquire)) {
+                // Drain remaining
+                while ((item = queue.pop()) != nullptr) {
+                    consumed_count[item_index(item)].fetch_add(1, std::memory_order_relaxed);
+                    total_consumed.fetch_add(1, std::memory_order_relaxed);
+                }
+                break;
+            }
+        }
+    };
+
+    std::thread prod(producer);
+    std::vector<std::thread> consumers;
+    for (int i = 0; i < kNumConsumers; i++) {
+        consumers.emplace_back(consumer);
+    }
+
+    prod.join();
+    for (auto &c : consumers) {
+        c.join();
+    }
+
+    EXPECT_EQ(total_consumed.load(), kTotalItems);
+
+    for (int i = 0; i < kTotalItems; i++) {
+        EXPECT_EQ(consumed_count[i].load(), 1)
+            << "Item " << i << " consumed " << consumed_count[i].load() << " times (expected 1)";
+    }
+}
+
+// =============================================================================
+// LocalReadyBuffer tests
+// 
============================================================================= + +class LocalReadyBufferTest : public ::testing::Test { +protected: + static constexpr int kCapacity = 8; + + PTO2LocalReadyBuffer buffer; + PTO2TaskSlotState *backing[kCapacity]; + + void SetUp() override { buffer.reset(backing, kCapacity); } +}; + +// 14. reset produces empty buffer that accepts pushes +TEST_F(LocalReadyBufferTest, ResetSetsCleanState) { + // After reset, buffer should behave as empty + EXPECT_EQ(buffer.pop(), nullptr) << "Fresh buffer is empty"; + + // Push and verify it works + PTO2TaskSlotState a, b; + ASSERT_TRUE(buffer.try_push(&a)); + ASSERT_TRUE(buffer.try_push(&b)); + + // Reset and verify empty behavior is restored + buffer.reset(backing, kCapacity); + EXPECT_EQ(buffer.pop(), nullptr) << "Buffer is empty after reset"; + + // Should accept full capacity of pushes again + PTO2TaskSlotState items[kCapacity]; + for (int i = 0; i < kCapacity; i++) { + EXPECT_TRUE(buffer.try_push(&items[i])); + } + EXPECT_FALSE(buffer.try_push(&a)) << "Full after pushing capacity items post-reset"; +} + +// 15. try_push/pop LIFO: push A,B -> pop returns B,A +TEST_F(LocalReadyBufferTest, LIFOOrdering) { + PTO2TaskSlotState a, b; + + ASSERT_TRUE(buffer.try_push(&a)); + ASSERT_TRUE(buffer.try_push(&b)); + + EXPECT_EQ(buffer.pop(), &b); + EXPECT_EQ(buffer.pop(), &a); + EXPECT_EQ(buffer.pop(), nullptr); +} + +// 16. try_push full: returns false at capacity +TEST_F(LocalReadyBufferTest, TryPushFullReturnsFalse) { + PTO2TaskSlotState items[kCapacity + 1]; + + for (int i = 0; i < kCapacity; i++) { + ASSERT_TRUE(buffer.try_push(&items[i])); + } + + EXPECT_FALSE(buffer.try_push(&items[kCapacity])); +} + +// 17. 
pop empty: returns nullptr
+TEST_F(LocalReadyBufferTest, PopEmptyReturnsNullptr) { EXPECT_EQ(buffer.pop(), nullptr); }
diff --git a/tests/ut/cpp/pto2_a2a3/test_ring_buffer.cpp b/tests/ut/cpp/pto2_a2a3/test_ring_buffer.cpp
new file mode 100644
index 000000000..4dc4d42e2
--- /dev/null
+++ b/tests/ut/cpp/pto2_a2a3/test_ring_buffer.cpp
@@ -0,0 +1,514 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Unit tests for PTO2TaskAllocator and PTO2DepListPool from pto_ring_buffer.h
+ *
+ * Tests ring buffer allocation, heap bump logic, dependency list pool,
+ * and known boundary conditions including a bug candidate in
+ * try_bump_heap wrap-around when tail == alloc_size.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstring>
+#include <vector>
+
+#include "pto_ring_buffer.h"
+
+// =============================================================================
+// Helpers
+// =============================================================================
+
+static constexpr int32_t kWindowSize = 16;   // Power of 2, small for testing
+static constexpr uint64_t kHeapSize = 1024;  // Small heap for boundary testing
+
+/**
+ * Test fixture for PTO2TaskAllocator tests.
+ *
+ * Sets up a descriptor array, heap buffer, and atomic flow-control variables.
+ * last_alive starts at 0, so tasks 0..window_size-2 can be allocated before
+ * the ring is considered full (active = local_task_id - last_alive + 1 < window_size).
+ */
+class TaskAllocatorTest : public ::testing::Test {
+protected:
+    void SetUp() override {
+        descriptors_.resize(kWindowSize);
+        std::memset(descriptors_.data(), 0, sizeof(PTO2TaskDescriptor) * kWindowSize);
+        heap_buf_.resize(kHeapSize, 0);
+
+        current_index_.store(0, std::memory_order_relaxed);
+        last_alive_.store(0, std::memory_order_relaxed);
+        error_code_.store(0, std::memory_order_relaxed);
+
+        allocator_.init(
+            descriptors_.data(), kWindowSize, &current_index_, &last_alive_, heap_buf_.data(), kHeapSize, &error_code_
+        );
+    }
+
+    // Simulate the scheduler consuming tasks up to (exclusive) task_id
+    // by advancing last_alive and setting packed_buffer_end on the consumed descriptor.
+    void consume_up_to(int32_t task_id, uint64_t heap_tail_offset) {
+        // Set the packed_buffer_end on the descriptor that last_alive-1 maps to
+        // so update_heap_tail can derive the tail.
+        int32_t last_consumed = task_id - 1;
+        descriptors_[last_consumed & (kWindowSize - 1)].packed_buffer_end =
+            static_cast<uint64_t>(reinterpret_cast<uintptr_t>(heap_buf_.data())) + heap_tail_offset;
+        last_alive_.store(task_id, std::memory_order_release);
+    }
+
+    PTO2TaskAllocator allocator_;
+    std::vector<PTO2TaskDescriptor> descriptors_;
+    std::vector<uint8_t> heap_buf_;
+    std::atomic<int32_t> current_index_{0};
+    std::atomic<int32_t> last_alive_{0};
+    std::atomic<int32_t> error_code_{0};
+};
+
+// =============================================================================
+// TaskAllocator: init and state queries
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, InitialState) {
+    EXPECT_EQ(allocator_.window_size(), kWindowSize);
+    EXPECT_EQ(allocator_.active_count(), 0);
+    EXPECT_EQ(allocator_.heap_top(), 0u);
+    EXPECT_EQ(allocator_.heap_capacity(), kHeapSize);
+    EXPECT_EQ(allocator_.heap_available(), kHeapSize);
+}
+
+// =============================================================================
+// TaskAllocator: single alloc with output_size=0
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, AllocZeroOutputSize) {
+    auto result = allocator_.alloc(0);
+    ASSERT_FALSE(result.failed());
+    EXPECT_EQ(result.task_id, 0);
+    EXPECT_EQ(result.slot, 0);
+    // packed_base should be heap_base + 0 (non-null)
+    EXPECT_NE(result.packed_base, nullptr);
+    // packed_end == packed_base when output_size == 0
+    EXPECT_EQ(result.packed_base, result.packed_end);
+    // Heap top should not advance for zero-size alloc
+    EXPECT_EQ(allocator_.heap_top(), 0u);
+}
+
+// =============================================================================
+// TaskAllocator: single alloc with non-zero size
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, AllocNonZeroSize) {
+    auto result = allocator_.alloc(100);
+    ASSERT_FALSE(result.failed());
+    EXPECT_EQ(result.task_id, 0);
EXPECT_EQ(result.slot, 0);
+    EXPECT_NE(result.packed_base, nullptr);
+    // 100 bytes aligned up to PTO2_ALIGN_SIZE (64) = 128
+    uint64_t expected_aligned = PTO2_ALIGN_UP(100u, PTO2_ALIGN_SIZE);
+    EXPECT_EQ(expected_aligned, 128u);
+    EXPECT_EQ(allocator_.heap_top(), expected_aligned);
+    EXPECT_EQ(
+        static_cast<uint8_t *>(result.packed_end) - static_cast<uint8_t *>(result.packed_base),
+        static_cast<ptrdiff_t>(expected_aligned)
+    );
+}
+
+// =============================================================================
+// TaskAllocator: sequential allocs produce sequential task IDs
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, SequentialTaskIds) {
+    for (int i = 0; i < 5; i++) {
+        auto result = allocator_.alloc(0);
+        ASSERT_FALSE(result.failed()) << "Alloc failed at i=" << i;
+        EXPECT_EQ(result.task_id, i);
+        EXPECT_EQ(result.slot, i & (kWindowSize - 1));
+    }
+    EXPECT_EQ(allocator_.active_count(), 5);
+}
+
+// =============================================================================
+// TaskAllocator: alignment of output_size to PTO2_ALIGN_SIZE
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, OutputSizeAlignment) {
+    // 1 byte -> aligned to 64
+    auto r1 = allocator_.alloc(1);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator_.heap_top(), 64u);
+
+    // Another 33 bytes -> aligned to 64, total 128
+    auto r2 = allocator_.alloc(33);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(allocator_.heap_top(), 128u);
+
+    // Exactly 64 bytes -> stays 64, total 192
+    auto r3 = allocator_.alloc(64);
+    ASSERT_FALSE(r3.failed());
+    EXPECT_EQ(allocator_.heap_top(), 192u);
+}
+
+// =============================================================================
+// TaskAllocator: try_bump_heap exact fit at end (space_at_end == alloc_size)
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, HeapExactFitAtEnd) {
+    // Heap size is 1024.
Allocate 960 bytes (15 * 64) to leave exactly 64 at end.
+    // Then allocate exactly 64 which should succeed (space_at_end >= alloc_size).
+    auto r1 = allocator_.alloc(960);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator_.heap_top(), 960u);
+
+    auto r2 = allocator_.alloc(64);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(allocator_.heap_top(), 1024u);
+    // Result pointer should be at heap_base + 960
+    EXPECT_EQ(static_cast<uint8_t *>(r2.packed_base), heap_buf_.data() + 960);
+}
+
+// =============================================================================
+// TaskAllocator: try_bump_heap wrap guard intentionally rejects tail == alloc_size
+//
+// The wrap guard `tail > alloc_size` uses strict > to prevent full/empty
+// ambiguity. If the allocation were allowed, heap_top would advance to
+// alloc_size == tail, making top == tail. Because top == tail is the
+// canonical "empty" state, the ring could not distinguish "completely full"
+// from "completely empty", causing subsequent allocations to overwrite
+// live data. Sacrificing one aligned quantum of capacity is the standard
+// circular-buffer technique to avoid this.
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, HeapWrapGuardRejectsTailEqualsAllocSize) {
+    // Fill heap completely: allocate 1024 bytes total
+    auto r1 = allocator_.alloc(1024);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator_.heap_top(), 1024u);
+
+    // Consume task 0, setting tail to exactly 64 (one aligned block)
+    consume_up_to(1, 64);
+
+    // top=1024 (== heap_size), tail=64, alloc_size=64
+    // space_at_end = 0, wrap check: tail(64) > alloc_size(64) -> FALSE
+    // Allocation is correctly rejected to preserve the top != tail invariant.
+    auto r2 = allocator_.alloc(64);
+    EXPECT_TRUE(r2.failed()) << "wrap guard must reject when tail == alloc_size (full/empty ambiguity)";
+}
+
+// =============================================================================
+// TaskAllocator: try_bump_heap wrap-around success (tail > alloc_size)
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, HeapWrapAroundSuccess) {
+    // Fill heap completely: allocate 1024 bytes
+    auto r1 = allocator_.alloc(1024);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator_.heap_top(), 1024u);
+
+    // Consume task 0, setting tail to 128 (more than one block)
+    consume_up_to(1, 128);
+
+    // Now: top=1024 (== heap_size), tail=128
+    // space_at_end = 0, so wrap-around check: tail(128) > alloc_size(64)? => TRUE
+    // Wraps to beginning: result = heap_base, top = 64
+    auto r2 = allocator_.alloc(64);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.packed_base, static_cast<void *>(heap_buf_.data()));
+    EXPECT_EQ(allocator_.heap_top(), 64u);
+}
+
+// =============================================================================
+// TaskAllocator: try_bump_heap top < tail linear-gap guard rejects exact fit
+//
+// The linear-gap guard `tail - top > alloc_size` uses strict > for the same
+// full/empty ambiguity reason as the wrap guard. If exact fit were allowed,
+// heap_top would advance to tail, making top == tail (looks empty). The
+// next allocation would see top >= tail with space_at_end = heap_size - top
+// and allocate into the region that still contains live data from the prior
+// wrap. The strict > sacrifices one quantum to keep top != tail.
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, HeapLinearGapGuardRejectsExactFit) {
+    // Fill heap, then wrap around to set up top < tail.
+ auto r1 = allocator_.alloc(960); + ASSERT_FALSE(r1.failed()); + + // Consume task 0, tail moves to 960 + consume_up_to(1, 960); + + // Allocate 128 bytes: space_at_end = 1024-960 = 64, not enough for 128. + // Wrap-around: tail(960) > 128 => TRUE, wraps. + auto r2 = allocator_.alloc(128); + ASSERT_FALSE(r2.failed()); + EXPECT_EQ(allocator_.heap_top(), 128u); + + // Now top=128, tail=960 (top < tail) + // gap = tail - top = 960 - 128 = 832 + // Allocate exactly 832 bytes: gap(832) > alloc_size(832) -> FALSE + // Correctly rejected to preserve top != tail invariant. + auto r3 = allocator_.alloc(832); + EXPECT_TRUE(r3.failed()) << "linear-gap guard must reject exact fit (full/empty ambiguity)"; +} + +// ============================================================================= +// TaskAllocator: try_bump_heap top < tail insufficient space +// ============================================================================= + +TEST_F(TaskAllocatorTest, HeapTopLessThanTailInsufficientSpace) { + // Set up top < tail scenario + auto r1 = allocator_.alloc(960); + ASSERT_FALSE(r1.failed()); + consume_up_to(1, 960); + + auto r2 = allocator_.alloc(128); + ASSERT_FALSE(r2.failed()); + EXPECT_EQ(allocator_.heap_top(), 128u); + + // Now top=128, tail=960. Available = 832. + // Try to allocate 896 (> 832): should fail (deadlock after spin). 
+ auto r3 = allocator_.alloc(896); + EXPECT_TRUE(r3.failed()); + EXPECT_NE(error_code_.load(), 0); +} + +// ============================================================================= +// TaskAllocator: update_heap_tail from consumed task +// ============================================================================= + +TEST_F(TaskAllocatorTest, UpdateHeapTailFromConsumedTask) { + auto r1 = allocator_.alloc(256); + ASSERT_FALSE(r1.failed()); + EXPECT_EQ(allocator_.heap_top(), 256u); + + // Before consumption, heap_available should be heap_size - top = 768 + EXPECT_EQ(allocator_.heap_available(), kHeapSize - 256u); + + // Consume task 0, tail moves to 256 + consume_up_to(1, 256); + + // Force the allocator to observe the new last_alive by doing another alloc + auto r2 = allocator_.alloc(0); + ASSERT_FALSE(r2.failed()); + + // After update_heap_tail, full heap should be available again + // top=256, tail=256, so available = heap_size - top = 768 (at_end) + // Actually: top >= tail, at_end = 1024-256=768, at_begin = 256 + // heap_available returns max(at_end, at_begin) = 768 + EXPECT_EQ(allocator_.heap_available(), kHeapSize - 256u); +} + +// ============================================================================= +// TaskAllocator: update_heap_tail at task 0 boundary +// +// When last_alive=1, update_heap_tail reads descriptors[(1-1) & mask] = descriptors[0]. +// This is task 0's descriptor, which should have valid packed_buffer_end. 
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, UpdateHeapTailAtTask0) {
+    // Allocate task 0 with some heap
+    auto r1 = allocator_.alloc(64);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(r1.task_id, 0);
+
+    // Set packed_buffer_end on task 0's descriptor
+    descriptors_[0].packed_buffer_end =
+        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(heap_buf_.data())) + 64;
+
+    // Advance last_alive to 1 (meaning task 0 is consumed)
+    last_alive_.store(1, std::memory_order_release);
+
+    // The next alloc triggers update_heap_tail(1), reading descriptors[0].
+    auto r2 = allocator_.alloc(0);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.task_id, 1);
+}
+
+// =============================================================================
+// TaskAllocator: update_heap_tail idempotent
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, UpdateHeapTailIdempotent) {
+    auto r1 = allocator_.alloc(128);
+    ASSERT_FALSE(r1.failed());
+
+    consume_up_to(1, 128);
+
+    // Multiple allocs should not cause heap_tail to drift
+    auto r2 = allocator_.alloc(0);
+    ASSERT_FALSE(r2.failed());
+    uint64_t avail_after_first = allocator_.heap_available();
+
+    auto r3 = allocator_.alloc(0);
+    ASSERT_FALSE(r3.failed());
+    EXPECT_EQ(allocator_.heap_available(), avail_after_first);
+}
+
+// =============================================================================
+// TaskAllocator: heap_available for top>=tail and top<tail
+// =============================================================================
+
+// =============================================================================
+// DepListPool test fixture
+// =============================================================================
+
+class DepListPoolTest : public ::testing::Test {
+protected:
+    static constexpr int32_t kPoolCapacity = 8;
+
+    void SetUp() override {
+        entries_.resize(kPoolCapacity);
+        pool_.init(entries_.data(), kPoolCapacity, &error_code_);
+    }
+
+    PTO2DepListPool pool_;
+    std::vector<PTO2DepListEntry> entries_;
+    std::atomic<int32_t> error_code_{0};
+};
+
+// =============================================================================
+// DepListPool: init (top=1, tail=1, entry 0 is NULL)
+// =============================================================================
+
+TEST_F(DepListPoolTest, InitialState) {
+    EXPECT_EQ(pool_.used(), 0);
+    EXPECT_EQ(pool_.available(), kPoolCapacity);
+
+    // Entry 0 should be NULL marker
+    EXPECT_EQ(entries_[0].slot_state, nullptr);
+    EXPECT_EQ(entries_[0].next, nullptr);
+} + +// ============================================================================= +// DepListPool: single alloc +// ============================================================================= + +TEST_F(DepListPoolTest, SingleAlloc) { + PTO2DepListEntry *entry = pool_.alloc(); + ASSERT_NE(entry, nullptr); + EXPECT_EQ(pool_.used(), 1); + EXPECT_EQ(pool_.available(), kPoolCapacity - 1); + + // The allocated entry should be at index 1 (top was 1, mod capacity) + EXPECT_EQ(entry, &entries_[1]); +} + +// ============================================================================= +// DepListPool: overflow detection +// ============================================================================= + +TEST_F(DepListPoolTest, OverflowDetection) { + // Allocate until full (capacity entries used) + for (int i = 0; i < kPoolCapacity; i++) { + PTO2DepListEntry *e = pool_.alloc(); + ASSERT_NE(e, nullptr) << "Unexpected failure at alloc " << i; + } + EXPECT_EQ(pool_.used(), kPoolCapacity); + EXPECT_EQ(pool_.available(), 0); + + // Next alloc should fail (overflow) + PTO2DepListEntry *overflow = pool_.alloc(); + EXPECT_EQ(overflow, nullptr); + EXPECT_NE(error_code_.load(), 0); + EXPECT_EQ(error_code_.load(), PTO2_ERROR_DEP_POOL_OVERFLOW); +} + +// ============================================================================= +// DepListPool: prepend chain integrity +// ============================================================================= + +TEST_F(DepListPoolTest, PrependChainIntegrity) { + PTO2TaskSlotState slot_a{}; + PTO2TaskSlotState slot_b{}; + PTO2TaskSlotState slot_c{}; + + // Build a chain: NULL -> slot_a -> slot_b -> slot_c (prepend order) + PTO2DepListEntry *head = nullptr; + + head = pool_.prepend(head, &slot_a); + ASSERT_NE(head, nullptr); + EXPECT_EQ(head->slot_state, &slot_a); + EXPECT_EQ(head->next, nullptr); + + head = pool_.prepend(head, &slot_b); + ASSERT_NE(head, nullptr); + EXPECT_EQ(head->slot_state, &slot_b); + EXPECT_EQ(head->next->slot_state, 
&slot_a); + EXPECT_EQ(head->next->next, nullptr); + + head = pool_.prepend(head, &slot_c); + ASSERT_NE(head, nullptr); + EXPECT_EQ(head->slot_state, &slot_c); + EXPECT_EQ(head->next->slot_state, &slot_b); + EXPECT_EQ(head->next->next->slot_state, &slot_a); + EXPECT_EQ(head->next->next->next, nullptr); +} + +// ============================================================================= +// DepListPool: advance_tail +// ============================================================================= + +TEST_F(DepListPoolTest, AdvanceTail) { + // Allocate 4 entries + for (int i = 0; i < 4; i++) { + pool_.alloc(); + } + EXPECT_EQ(pool_.used(), 4); + EXPECT_EQ(pool_.available(), kPoolCapacity - 4); + + // Advance tail by 3 (from 1 to 4) + pool_.advance_tail(4); + EXPECT_EQ(pool_.used(), 1); + EXPECT_EQ(pool_.available(), kPoolCapacity - 1); +} + +// ============================================================================= +// DepListPool: advance_tail backwards (no-op) +// ============================================================================= + +TEST_F(DepListPoolTest, AdvanceTailBackwardsNoop) { + pool_.alloc(); + pool_.alloc(); + pool_.advance_tail(3); + int32_t used_after = pool_.used(); + + // Trying to advance backwards should be a no-op + pool_.advance_tail(2); + EXPECT_EQ(pool_.used(), used_after); + + // Same value should also be a no-op + pool_.advance_tail(3); + EXPECT_EQ(pool_.used(), used_after); +} diff --git a/tests/ut/cpp/pto2_a2a3/test_ring_buffer_edge.cpp b/tests/ut/cpp/pto2_a2a3/test_ring_buffer_edge.cpp new file mode 100644 index 000000000..e380012d1 --- /dev/null +++ b/tests/ut/cpp/pto2_a2a3/test_ring_buffer_edge.cpp @@ -0,0 +1,478 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Edge-case tests for PTO2TaskAllocator and PTO2DepListPool. + * + * Each test targets a specific code path, boundary condition, or potential + * latent bug discovered through line-by-line analysis of pto_ring_buffer.h. + * + * Note: the unified PTO2TaskAllocator replaces the previous separate + * PTO2HeapRing + PTO2TaskRing. Because the allocator is single-threaded + * (orchestrator thread), CAS/concurrency edge cases that applied to the + * old design are no longer meaningful and have been removed. The wrap / + * fragmentation / zero-size tests remain and exercise try_bump_heap and + * the task window check. + * + * ============================================================================ + * DESIGN CONTRACTS -- PTO2TaskAllocator (try_bump_heap) + * ============================================================================ + * + * DC-1: Wrap-around guard uses `tail > alloc_size` (strict >). When + * tail == alloc_size the wrap branch returns nullptr. This is + * intentional: allowing the allocation would set heap_top_ = + * alloc_size == tail, creating the classic circular-buffer full/empty + * ambiguity where top == tail must mean "empty". The strict > + * sacrifices one aligned quantum of capacity to keep top != tail + * whenever the buffer has live data. + * + * DC-3: `heap_available()` returns max(at_end, at_begin), not the sum. 
+ * A single allocation cannot split across the wrap boundary, so max + * is the right semantic -- callers should treat the return value as + * "largest contiguous allocation possible", not "total free bytes". + * + * DC-9: Zero-size allocation is a no-op that returns the current top + * without advancing. Two consecutive zero-size allocs return the + * SAME pointer. Semantically correct for a bump allocator. + * + * DC-10: Wrap path writes new_top = alloc_size; the wasted space at + * the end of the heap (between old top and heap_size) is not + * reclaimed because tail is advanced by packed_buffer_end, not by + * heap_size. Inherent to ring-buffer algorithms; acceptable + * fragmentation cost for allocator simplicity. + * + * EDGE-1: top == tail == 0 (initial state). space_at_end = heap_size. + * EDGE-2: top == heap_size (exactly at end). space_at_end = 0, must wrap. + * + * ============================================================================ + * DESIGN CONTRACTS -- Task window (via PTO2TaskAllocator::alloc) + * ============================================================================ + * + * EDGE-5: window_size = 1. Check `local_task_id_ - last_alive + 1 < 1` + * is always false -> every allocation spins forever (deadlock). This + * is undefined/unsupported configuration. + * + * ============================================================================ + * DESIGN CONTRACTS -- DepListPool + * ============================================================================ + * + * Note: earlier comments in this file called `base[0]` a "sentinel" that + * must never be overwritten. That is **not** how the current src works. + * The list terminator is literal `nullptr` (see pto_scheduler.h fanout + * walk and `PTO2TaskSlotState::fanout_head = nullptr` initialization in + * pto_runtime2_types.h). `base[0]` is a normal pool entry; the init + * clearing in `DepListPool::init` is incidental, not an invariant. 
The
+ * historical `SentinelOverwrite` / `SentinelDataCorruption` /
+ * `MultiCyclesSentinelIntegrity` tests have been removed; they were
+ * asserting behavior the src never promised.
+ *
+ * DC-7: `advance_tail(new_tail)` only advances if new_tail > tail; it
+ * does not validate new_tail <= top. The caller (orchestrator) is
+ * contracted to pass monotonically advancing, top-bounded values.
+ * Documented as an API contract; not a live defect.
+ */
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cstdint>
+#include <cstring>
+#include <set>
+#include <vector>
+
+#include "pto_ring_buffer.h"
+
+// =============================================================================
+// Helper: advance last_alive so try_bump_heap can derive heap_tail from the
+// consumed task's packed_buffer_end.
+// =============================================================================
+static void consume_up_to(
+    std::vector<PTO2TaskDescriptor> &descriptors, std::atomic<int32_t> &last_alive, void *heap_base,
+    int32_t window_size, int32_t new_last_alive, uint64_t heap_tail_offset
+) {
+    int32_t last_consumed = new_last_alive - 1;
+    descriptors[last_consumed & (window_size - 1)].packed_buffer_end =
+        static_cast<uint8_t *>(heap_base) + heap_tail_offset;
+    last_alive.store(new_last_alive, std::memory_order_release);
+}
+
+// =============================================================================
+// TaskAllocator edge-case fixture
+// =============================================================================
+class TaskAllocatorEdgeTest : public ::testing::Test {
+protected:
+    static constexpr int32_t WINDOW_SIZE = 16;
+    static constexpr uint64_t HEAP_SIZE = 4096;
+
+    std::vector<PTO2TaskDescriptor> descriptors;
+    alignas(64) uint8_t heap_buf[HEAP_SIZE]{};
+    std::atomic<int32_t> current_index{0};
+    std::atomic<int32_t> last_alive{0};
+    std::atomic<int32_t> error_code{PTO2_ERROR_NONE};
+    PTO2TaskAllocator allocator{};
+
+    void SetUp() override {
+        descriptors.assign(WINDOW_SIZE, PTO2TaskDescriptor{});
+        std::memset(heap_buf, 0, sizeof(heap_buf));
+        current_index.store(0);
+        last_alive.store(0);
+        error_code.store(PTO2_ERROR_NONE);
+        allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code);
+    }
+};
+
+// ---------------------------------------------------------------------------
+// DESIGN: Wrap guard `tail > alloc_size` is intentionally strict.
+// When tail == alloc_size, accepting the allocation would set top == tail,
+// creating full/empty ambiguity. The guard sacrifices one quantum.
+// ---------------------------------------------------------------------------
+TEST_F(TaskAllocatorEdgeTest, WrapGuard_TailEqualsAllocSize) {
+    // Fill heap to end.
+    auto r1 = allocator.alloc(HEAP_SIZE);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator.heap_top(), HEAP_SIZE);
+
+    // Consume task 0 to advance heap_tail to exactly 64.
+    consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 64);
+
+    // top == HEAP_SIZE, tail == 64, request 64 bytes.
+    // space_at_end = 0. Wrap: tail(64) > 64 -> FALSE -> correctly rejected.
+    auto r2 = allocator.alloc(64);
+    EXPECT_TRUE(r2.failed()) << "Wrap guard correctly rejects when tail == alloc_size (full/empty ambiguity)";
+}
+
+// ---------------------------------------------------------------------------
+// EDGE-2: top at exact end of heap (top == heap_size). After a full
+// wrap the allocation must land at the base.
+// ---------------------------------------------------------------------------
+TEST_F(TaskAllocatorEdgeTest, TopAtExactEnd) {
+    auto r1 = allocator.alloc(HEAP_SIZE);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator.heap_top(), HEAP_SIZE);
+
+    // Advance tail so the wrap path has enough room for the next alloc.
+    consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 128);
+
+    // top(HEAP_SIZE) >= tail(128). space_at_end = 0.
+    // Wrap: tail(128) > 64 -> true -> new_top = 64, result = base.
+    auto r2 = allocator.alloc(64);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.packed_base, static_cast<uint8_t *>(heap_buf));
+    EXPECT_EQ(allocator.heap_top(), 64u);
+}
+
+// ---------------------------------------------------------------------------
+// BUG-CANDIDATE-3: heap_available() reports max(at_end, at_begin), not the sum.
+// ---------------------------------------------------------------------------
+TEST_F(TaskAllocatorEdgeTest, AvailableFragmentation) {
+    // Create a fragmented state: top near middle/high, tail in middle.
+    auto r1 = allocator.alloc(3008);  // top ~ 3008 (already aligned to 64)
+    ASSERT_FALSE(r1.failed());
+    uint64_t actual_top = allocator.heap_top();
+
+    consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 1024);
+
+    // Force the allocator to refresh its cached heap_tail.
+    auto r_probe = allocator.alloc(0);
+    ASSERT_FALSE(r_probe.failed());
+
+    uint64_t avail = allocator.heap_available();
+    uint64_t at_end = HEAP_SIZE - actual_top;
+    uint64_t at_begin = 1024;
+    EXPECT_EQ(avail, std::max(at_end, at_begin));
+
+    // Total free bytes (at_end + at_begin) may exceed what a single alloc can
+    // take, because allocations never split across the wrap boundary.
+    EXPECT_LT(avail, at_end + at_begin);
+}
+
+// ---------------------------------------------------------------------------
+// BUG-CANDIDATE-9: Zero-size allocation does not advance the heap pointer.
+// Two consecutive zero-size allocs return the SAME address (aliased).
+// ---------------------------------------------------------------------------
+TEST_F(TaskAllocatorEdgeTest, ZeroSizeAllocation) {
+    auto r1 = allocator.alloc(0);
+    auto r2 = allocator.alloc(0);
+    ASSERT_FALSE(r1.failed());
+    ASSERT_FALSE(r2.failed());
+
+    EXPECT_EQ(r1.packed_base, r2.packed_base) << "Zero-size allocs return same address";
+    EXPECT_EQ(r1.packed_base, r1.packed_end) << "packed_end == packed_base for zero-size";
+    EXPECT_EQ(r2.packed_base, r2.packed_end);
+    EXPECT_EQ(allocator.heap_top(), 0u) << "top doesn't advance for zero-size allocs";
+}
+
+// ---------------------------------------------------------------------------
+// BUG-CANDIDATE-10: Wrap-path wasted space accumulation.
+// When wrapping, space between old top and heap_size is leaked.
+// ---------------------------------------------------------------------------
+TEST_F(TaskAllocatorEdgeTest, WrapPathWastedSpace) {
+    // Allocate 4000 bytes -> top rounds to 4032 (aligned).
+    auto r1 = allocator.alloc(4000);
+    ASSERT_FALSE(r1.failed());
+    uint64_t top_after = allocator.heap_top();
+    EXPECT_GE(top_after, 4000u);
+    EXPECT_LT(top_after, HEAP_SIZE);  // Some trailing space remains unused.
+
+    // Reclaim task 0: tail moves up to match top (logically empty).
+    consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, top_after);
+
+    // space_at_end = HEAP_SIZE - top_after (e.g. 64). < 128 -> must wrap.
+    // After the wrap, the 64 trailing bytes are unreachable.
+    auto r2 = allocator.alloc(128);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.packed_base, static_cast<uint8_t *>(heap_buf)) << "Allocation wrapped to beginning";
+
+    // Available now reflects the new (post-wrap) top and the stale tail.
+    uint64_t avail = allocator.heap_available();
+    EXPECT_LT(avail, HEAP_SIZE) << "Wasted space at end reduces available capacity";
+}
+
+// ---------------------------------------------------------------------------
+// Allocation of exactly heap_size: consumes entire heap in one shot.
+// ---------------------------------------------------------------------------
+TEST_F(TaskAllocatorEdgeTest, AllocExactlyHeapSize) {
+    auto r1 = allocator.alloc(HEAP_SIZE);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(r1.packed_base, static_cast<uint8_t *>(heap_buf));
+    EXPECT_EQ(allocator.heap_top(), HEAP_SIZE);
+
+    // No more space (and no reclamation) -> next alloc spins to deadlock.
+    auto r2 = allocator.alloc(64);
+    EXPECT_TRUE(r2.failed()) << "No space after full allocation";
+    EXPECT_EQ(error_code.load(), PTO2_ERROR_HEAP_RING_DEADLOCK);
+}
+
+// ---------------------------------------------------------------------------
+// Allocation larger than heap_size: must fail (heap deadlock).
+// ---------------------------------------------------------------------------
+TEST_F(TaskAllocatorEdgeTest, AllocLargerThanHeap) {
+    auto r = allocator.alloc(HEAP_SIZE * 2);
+    EXPECT_TRUE(r.failed()) << "Cannot allocate more than heap size";
+    EXPECT_EQ(error_code.load(), PTO2_ERROR_HEAP_RING_DEADLOCK);
+}
+
+// ---------------------------------------------------------------------------
+// Task window saturates: allocator.alloc blocks when
+// (local_task_id - last_alive + 1) >= window_size.
+// ---------------------------------------------------------------------------
+TEST_F(TaskAllocatorEdgeTest, TaskWindowSaturates) {
+    // Allocate until the window is full: window allows window_size - 1 active.
+    for (int i = 0; i < WINDOW_SIZE - 1; i++) {
+        auto r = allocator.alloc(0);
+        ASSERT_FALSE(r.failed()) << "Alloc failed at i=" << i;
+        EXPECT_EQ(r.task_id, i);
+    }
+    EXPECT_EQ(allocator.active_count(), WINDOW_SIZE - 1);
+
+    // The next alloc would push active_count to window_size and is refused
+    // (spins until deadlock since last_alive is not advancing).
+    auto overflow = allocator.alloc(0);
+    EXPECT_TRUE(overflow.failed());
+    EXPECT_EQ(error_code.load(), PTO2_ERROR_FLOW_CONTROL_DEADLOCK);
+}
+
+// ---------------------------------------------------------------------------
+// Slot mapping uses `task_id & window_mask` -- with a power-of-two window
+// this is equivalent to modulo. Every consecutive window_size task IDs
+// visit every slot exactly once.
+// ---------------------------------------------------------------------------
+TEST_F(TaskAllocatorEdgeTest, SlotMappingPowerOfTwoWindow) {
+    std::set<int32_t> slots;
+    for (int i = 0; i < WINDOW_SIZE; i++) {
+        // Advance last_alive so we can keep allocating past the window.
+        consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, i, 0);
+        auto r = allocator.alloc(0);
+        ASSERT_FALSE(r.failed());
+        EXPECT_EQ(r.slot, r.task_id & (WINDOW_SIZE - 1));
+        slots.insert(r.slot);
+    }
+    EXPECT_EQ(slots.size(), static_cast<size_t>(WINDOW_SIZE))
+        << "Every slot should be visited exactly once over one window cycle";
+}
+
+// ---------------------------------------------------------------------------
+// BUG-CANDIDATE-11 (adapted): Task IDs grow monotonically as int32_t.
+// Near INT32_MAX, fetch-like behavior would overflow in the old design;
+// the new allocator increments a local counter and publishes it -- the
+// same signed-overflow concern applies but is cosmetic here since we
+// use `task_id & window_mask` for indexing.
+// ---------------------------------------------------------------------------
+TEST_F(TaskAllocatorEdgeTest, TaskIdNearInt32Max) {
+    // Seed the shared counter near INT32_MAX and re-init so the allocator
+    // picks up the seed as its local counter.
+    current_index.store(INT32_MAX - 2);
+    last_alive.store(INT32_MAX - 2);
+    allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code);
+
+    auto r1 = allocator.alloc(0);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(r1.task_id, INT32_MAX - 2);
+    EXPECT_EQ(r1.slot, (INT32_MAX - 2) & (WINDOW_SIZE - 1));
+
+    auto r2 = allocator.alloc(0);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.task_id, INT32_MAX - 1);
+
+    auto r3 = allocator.alloc(0);
+    ASSERT_FALSE(r3.failed());
+    EXPECT_EQ(r3.task_id, INT32_MAX);
+    // Slot mask still yields a valid slot regardless of sign.
+    EXPECT_GE(r3.slot, 0);
+    EXPECT_LT(r3.slot, WINDOW_SIZE);
+}
+
+// =============================================================================
+// DepListPool edge-case fixture
+// =============================================================================
+class DepPoolEdgeTest : public ::testing::Test {
+protected:
+    static constexpr int32_t POOL_CAP = 8;
+    PTO2DepListEntry entries[POOL_CAP]{};
+    std::atomic<int32_t> error_code{PTO2_ERROR_NONE};
+    PTO2DepListPool pool{};
+
+    void SetUp() override {
+        std::memset(entries, 0, sizeof(entries));
+        error_code.store(PTO2_ERROR_NONE);
+        pool.init(entries, POOL_CAP, &error_code);
+    }
+};
+
+// ---------------------------------------------------------------------------
+// DC-7 (contract): advance_tail does not validate new_tail <= top.
+// Caller (orchestrator) is contracted to pass monotonic top-bounded values;
+// these two tests document what happens if that contract is violated, to
+// anchor the API shape -- they are not bug reports.
+// --------------------------------------------------------------------------- +TEST_F(DepPoolEdgeTest, AdvanceTailBeyondTop_ContractViolationProducesNegativeUsed) { + pool.alloc(); // top=2 + pool.alloc(); // top=3 + + pool.advance_tail(100); // caller contract violation + + int32_t u = pool.used(); // top(3) - tail(100) = -97 + int32_t a = pool.available(); // capacity(8) - (-97) = 105 + + EXPECT_LT(u, 0) << "used() goes negative when tail > top"; + EXPECT_GT(a, pool.capacity) << "available() exceeds capacity when tail > top"; +} + +TEST_F(DepPoolEdgeTest, AdvanceTailBeyondTop_ContractViolationLetsAllocProceed) { + pool.alloc(); // top=2 + pool.advance_tail(100); // caller contract violation + + // used() is negative -> overflow check (used >= capacity) is false -> alloc proceeds. + PTO2DepListEntry *e = pool.alloc(); + EXPECT_NE(e, nullptr) << "Alloc succeeds with corrupted tail (negative used)"; + EXPECT_LT(pool.used(), 0) << "Pool state remains corrupted: negative used count"; +} + +// --------------------------------------------------------------------------- +// Prepend chain integrity under pool exhaustion: chain must be walkable. +// --------------------------------------------------------------------------- +TEST_F(DepPoolEdgeTest, PrependUnderExhaustion) { + PTO2TaskSlotState slots[POOL_CAP]{}; + PTO2DepListEntry *head = nullptr; + + int count = 0; + while (count < POOL_CAP + 5) { // Try beyond capacity + PTO2DepListEntry *new_head = pool.prepend(head, &slots[count % POOL_CAP]); + if (!new_head) break; + head = new_head; + count++; + } + + // Walk the chain -- should be intact (no cycles, no overruns). 
+ int walk = 0; + PTO2DepListEntry *cur = head; + while (cur) { + walk++; + cur = cur->next; + if (walk > count + 1) { + FAIL() << "Chain has cycle -- walked more entries than allocated"; + break; + } + } + EXPECT_EQ(walk, count); +} + +// --------------------------------------------------------------------------- +// Prepend builds linked list correctly: verify each slot_state pointer. +// --------------------------------------------------------------------------- +TEST_F(DepPoolEdgeTest, PrependChainCorrectness) { + PTO2TaskSlotState slots[5]{}; + PTO2DepListEntry *head = nullptr; + + for (int i = 0; i < 5; i++) { + head = pool.prepend(head, &slots[i]); + ASSERT_NE(head, nullptr); + } + + // LIFO order: head -> slots[4] -> slots[3] -> ... -> slots[0] -> nullptr. + PTO2DepListEntry *cur = head; + for (int i = 4; i >= 0; i--) { + ASSERT_NE(cur, nullptr); + EXPECT_EQ(cur->slot_state, &slots[i]) << "Entry " << (4 - i) << " should point to slots[" << i << "]"; + cur = cur->next; + } + EXPECT_EQ(cur, nullptr) << "Chain should terminate with nullptr"; +} + +// --------------------------------------------------------------------------- +// High-water mark accuracy after reclaim cycles (ABI contract: diagnostic field). +// --------------------------------------------------------------------------- +TEST_F(DepPoolEdgeTest, HighWaterAccuracy) { + for (int i = 0; i < 5; i++) + pool.alloc(); + EXPECT_EQ(pool.high_water, 5); + + pool.advance_tail(4); + EXPECT_EQ(pool.high_water, 5) << "High water never decreases"; + + for (int i = 0; i < 3; i++) + pool.alloc(); + EXPECT_GE(pool.high_water, 5); +} + +// --------------------------------------------------------------------------- +// Advance tail backwards is a no-op. +// --------------------------------------------------------------------------- +TEST_F(DepPoolEdgeTest, AdvanceTailBackwards) { + pool.alloc(); + pool.alloc(); + pool.advance_tail(3); + + pool.advance_tail(1); // Should be no-op. 
+    EXPECT_EQ(pool.used(), 0) << "advance_tail backwards is a no-op";
+}
+
+// ---------------------------------------------------------------------------
+// Pool init state verification.
+// ---------------------------------------------------------------------------
+TEST_F(DepPoolEdgeTest, InitState) {
+    EXPECT_EQ(pool.used(), 0) << "initially empty";
+    EXPECT_EQ(pool.available(), POOL_CAP) << "full capacity available";
+    EXPECT_EQ(entries[0].slot_state, nullptr) << "entries[0].slot_state cleared by init";
+    EXPECT_EQ(entries[0].next, nullptr) << "entries[0].next cleared by init";
+}
+
+// ---------------------------------------------------------------------------
+// Alloc all then overflow: verify error code is set.
+// ---------------------------------------------------------------------------
+TEST_F(DepPoolEdgeTest, OverflowSetsErrorCode) {
+    for (int i = 0; i < POOL_CAP; i++) {
+        pool.alloc();
+    }
+
+    PTO2DepListEntry *overflow_result = pool.alloc();
+    EXPECT_EQ(overflow_result, nullptr) << "Overflow returns nullptr";
+    EXPECT_EQ(error_code.load(), PTO2_ERROR_DEP_POOL_OVERFLOW) << "Error code set on overflow";
+}
diff --git a/tests/ut/cpp/pto2_a2a3/test_runtime_graph.cpp b/tests/ut/cpp/pto2_a2a3/test_runtime_graph.cpp
new file mode 100644
index 000000000..e8b740cc8
--- /dev/null
+++ b/tests/ut/cpp/pto2_a2a3/test_runtime_graph.cpp
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Unit tests for host_build_graph Runtime class.
+ *
+ * Tests task graph construction: add_task, add_successor,
+ * ready task detection, and dependency graph patterns.
+ */
+
+#include <gtest/gtest.h>
+#include "../../../../src/a2a3/runtime/host_build_graph/runtime/runtime.h"
+
+// =============================================================================
+// Test fixture -- allocates a Runtime on the heap (it's very large)
+// =============================================================================
+
+class RuntimeGraphTest : public ::testing::Test {
+protected:
+    Runtime *rt = nullptr;
+
+    void SetUp() override { rt = new Runtime(); }
+
+    void TearDown() override { delete rt; }
+
+    // Helper: add a task with no args
+    int addTask(int func_id = 0, CoreType core_type = CoreType::AIV) {
+        return rt->add_task(nullptr, 0, func_id, core_type);
+    }
+};
+
+// =============================================================================
+// Basic task addition
+// =============================================================================
+
+TEST_F(RuntimeGraphTest, AddTask_MonotonicId) {
+    int id0 = addTask();
+    int id1 = addTask();
+    int id2 = addTask();
+
+    EXPECT_EQ(id0, 0);
+    EXPECT_EQ(id1, 1);
+    EXPECT_EQ(id2, 2);
+    EXPECT_EQ(rt->get_task_count(), 3);
+}
+
+TEST_F(RuntimeGraphTest, AddTask_StoresFields) {
+    uint64_t args[] = {42, 99};
+    int id = rt->add_task(args, 2, /*func_id=*/7, CoreType::AIC);
+
+    Task *t = rt->get_task(id);
+    ASSERT_NE(t, nullptr);
+    EXPECT_EQ(t->func_id, 7);
+    EXPECT_EQ(t->num_args, 2);
+    EXPECT_EQ(t->args[0], 42u);
+    EXPECT_EQ(t->args[1], 99u);
+    EXPECT_EQ(t->core_type, CoreType::AIC);
+}
+
+// =============================================================================
+// Dependency edges
+// =============================================================================
+
+TEST_F(RuntimeGraphTest,
AddSuccessor_UpdatesFanoutAndFanin) { + int a = addTask(); + int b = addTask(); + + rt->add_successor(a, b); + + Task *ta = rt->get_task(a); + Task *tb = rt->get_task(b); + + EXPECT_EQ(ta->fanout_count, 1); + EXPECT_EQ(ta->fanout[0], b); + EXPECT_EQ(tb->fanin.load(), 1); +} + +// ============================================================================= +// Ready task detection +// ============================================================================= + +TEST_F(RuntimeGraphTest, ReadyTaskDetection) { + // Task 0 has no deps (ready), Task 1 depends on Task 0 (not ready) + int a = addTask(); + int b = addTask(); + rt->add_successor(a, b); + + int ready[RUNTIME_MAX_TASKS]; + int count = rt->get_initial_ready_tasks(ready); + + EXPECT_EQ(count, 1); + EXPECT_EQ(ready[0], a); +} + +// ============================================================================= +// Diamond DAG: A -> {B, C} -> D +// ============================================================================= + +TEST_F(RuntimeGraphTest, DiamondDAG) { + int a = addTask(); + int b = addTask(); + int c = addTask(); + int d = addTask(); + + rt->add_successor(a, b); + rt->add_successor(a, c); + rt->add_successor(b, d); + rt->add_successor(c, d); + + // Only A should be ready + int ready[RUNTIME_MAX_TASKS]; + int count = rt->get_initial_ready_tasks(ready); + EXPECT_EQ(count, 1); + EXPECT_EQ(ready[0], a); + + // D should have fanin=2 + Task *td = rt->get_task(d); + EXPECT_EQ(td->fanin.load(), 2); + + // A should have fanout=2 + Task *ta = rt->get_task(a); + EXPECT_EQ(ta->fanout_count, 2); +} + +// ============================================================================= +// Linear chain: A -> B -> C -> D +// ============================================================================= + +TEST_F(RuntimeGraphTest, LinearChain) { + int a = addTask(); + int b = addTask(); + int c = addTask(); + int d = addTask(); + + rt->add_successor(a, b); + rt->add_successor(b, c); + rt->add_successor(c, d); + + // 
Only A is ready + int ready[RUNTIME_MAX_TASKS]; + int count = rt->get_initial_ready_tasks(ready); + EXPECT_EQ(count, 1); + EXPECT_EQ(ready[0], a); + + // Each task has exactly fanin=1 except A + EXPECT_EQ(rt->get_task(a)->fanin.load(), 0); + EXPECT_EQ(rt->get_task(b)->fanin.load(), 1); + EXPECT_EQ(rt->get_task(c)->fanin.load(), 1); + EXPECT_EQ(rt->get_task(d)->fanin.load(), 1); +} + +// ============================================================================= +// Fanout / Fanin consistency +// ============================================================================= + +TEST_F(RuntimeGraphTest, FanoutFaninConsistency) { + // Build: T0 -> {T1, T2, T3}, T1 -> T4, T2 -> T4, T3 -> T4 + int t0 = addTask(); + int t1 = addTask(); + int t2 = addTask(); + int t3 = addTask(); + int t4 = addTask(); + + rt->add_successor(t0, t1); + rt->add_successor(t0, t2); + rt->add_successor(t0, t3); + rt->add_successor(t1, t4); + rt->add_successor(t2, t4); + rt->add_successor(t3, t4); + + // Verify: total fanout references == total fanin across all tasks + int total_fanout = 0; + int total_fanin = 0; + for (int i = 0; i < rt->get_task_count(); i++) { + Task *t = rt->get_task(i); + total_fanout += t->fanout_count; + total_fanin += t->fanin.load(); + } + EXPECT_EQ(total_fanout, total_fanin); +} + +// ============================================================================= +// Max task limit +// ============================================================================= + +TEST_F(RuntimeGraphTest, MaxTaskLimit) { + // Fill up to RUNTIME_MAX_TASKS (this is 131072, too large to loop in test) + // Instead test that adding more tasks after setting next_task_id near max fails. + // We'll add a few tasks, then check the add_task return value logic. 
+
+    // Add one task successfully
+    int id = addTask();
+    EXPECT_GE(id, 0);
+
+    // get_task with invalid ID returns nullptr
+    EXPECT_EQ(rt->get_task(-1), nullptr);
+    EXPECT_EQ(rt->get_task(RUNTIME_MAX_TASKS + 1), nullptr);
+}
+
+// =============================================================================
+// Tensor pair management
+// =============================================================================
+
+TEST_F(RuntimeGraphTest, TensorPairManagement) {
+    EXPECT_EQ(rt->get_tensor_pair_count(), 0);
+
+    char host_buf[64], dev_buf[64];
+    rt->record_tensor_pair(host_buf, dev_buf, 64);
+
+    EXPECT_EQ(rt->get_tensor_pair_count(), 1);
+
+    TensorPair *pairs = rt->get_tensor_pairs();
+    EXPECT_EQ(pairs[0].host_ptr, static_cast<void *>(host_buf));
+    EXPECT_EQ(pairs[0].dev_ptr, static_cast<void *>(dev_buf));
+    EXPECT_EQ(pairs[0].size, 64u);
+
+    rt->clear_tensor_pairs();
+    EXPECT_EQ(rt->get_tensor_pair_count(), 0);
+}
+
+// =============================================================================
+// Kernel address mapping
+// =============================================================================
+
+TEST_F(RuntimeGraphTest, FunctionBinAddrMapping) {
+    rt->set_function_bin_addr(0, 0xDEAD);
+    rt->set_function_bin_addr(5, 0xBEEF);
+
+    EXPECT_EQ(rt->get_function_bin_addr(0), 0xDEADu);
+    EXPECT_EQ(rt->get_function_bin_addr(5), 0xBEEFu);
+    EXPECT_EQ(rt->get_function_bin_addr(1), 0u);  // Not set
+    EXPECT_EQ(rt->get_function_bin_addr(-1), 0u);  // Invalid
+    EXPECT_EQ(rt->get_function_bin_addr(RUNTIME_MAX_FUNC_ID), 0u);  // Out of range
+}
diff --git a/tests/ut/cpp/pto2_a2a3/test_runtime_lifecycle.cpp b/tests/ut/cpp/pto2_a2a3/test_runtime_lifecycle.cpp
new file mode 100644
index 000000000..6084c2d25
--- /dev/null
+++ b/tests/ut/cpp/pto2_a2a3/test_runtime_lifecycle.cpp
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO2 Runtime lifecycle UT.
+ *
+ * Covers pto2_runtime_create / _custom / _from_sm / _destroy / set_mode.
+ *
+ * Follows AAA and FIRST: no shared mutable state between tests, each test
+ * constructs its own runtime and tears it down.
+ */
+
+#include <gtest/gtest.h>
+
+#include <cstdlib>
+
+#include "pto_runtime2.h"
+#include "pto_shared_memory.h"
+
+namespace {
+
+constexpr uint64_t kSmallWindow = 64;
+constexpr uint64_t kSmallHeap = 64 * 1024;
+
+// -----------------------------------------------------------------------------
+// Fixture: each test gets a fresh, isolated runtime config.
+// ----------------------------------------------------------------------------- +class RuntimeLifecycleTest : public ::testing::Test { +protected: + PTO2Runtime *rt_ = nullptr; + + void TearDown() override { + if (rt_ != nullptr) { + pto2_runtime_destroy(rt_); + rt_ = nullptr; + } + } +}; + +} // namespace + +// ---------- Happy-path creation ---------- + +TEST_F(RuntimeLifecycleTest, CreateCustom_ValidSizes_ReturnsInitializedRuntime) { + // Arrange + Act + rt_ = pto2_runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap); + + // Assert + ASSERT_NE(rt_, nullptr); + EXPECT_NE(rt_->ops, nullptr); + EXPECT_NE(rt_->sm_handle, nullptr); + EXPECT_NE(rt_->gm_heap, nullptr); + EXPECT_TRUE(rt_->gm_heap_owned); + EXPECT_EQ(rt_->mode, PTO2_MODE_SIMULATE); + EXPECT_EQ(rt_->gm_heap_size, kSmallHeap * PTO2_MAX_RING_DEPTH); +} + +TEST_F(RuntimeLifecycleTest, CreateCustom_ConnectsOrchestratorToScheduler) { + rt_ = pto2_runtime_create_custom(PTO2_MODE_EXECUTE, kSmallWindow, kSmallHeap); + + ASSERT_NE(rt_, nullptr); + // In simulated mode the orchestrator must hold a pointer to the scheduler. + EXPECT_EQ(rt_->orchestrator.scheduler, &rt_->scheduler); +} + +TEST_F(RuntimeLifecycleTest, CreateDefault_UsesDefaultSizes) { + // create() is a thin wrapper around create_custom with PTO2_TASK_WINDOW_SIZE / PTO2_HEAP_SIZE. + // Use GRAPH_ONLY to avoid executor threads. We don't allocate the full + // 256MB heap in this path -- keep the assertion restricted to mode. + rt_ = pto2_runtime_create(PTO2_MODE_GRAPH_ONLY); + ASSERT_NE(rt_, nullptr); + EXPECT_EQ(rt_->mode, PTO2_MODE_GRAPH_ONLY); +} + +// ---------- From-SM creation ---------- + +TEST_F(RuntimeLifecycleTest, CreateFromSM_NullHandle_ReturnsNull) { + // Act + PTO2Runtime *rt = pto2_runtime_create_from_sm(PTO2_MODE_SIMULATE, nullptr, nullptr, 0); + + // Assert + EXPECT_EQ(rt, nullptr); +} + +TEST_F(RuntimeLifecycleTest, CreateFromSM_RecordsCallerBuffers) { + // Arrange: caller-allocated sm + gm_heap. 
+    PTO2SharedMemoryHandle *sm = pto2_sm_create(kSmallWindow, kSmallHeap);
+    ASSERT_NE(sm, nullptr);
+    uint8_t *heap = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, kSmallHeap));
+    ASSERT_NE(heap, nullptr);
+
+    // Act
+    rt_ = pto2_runtime_create_from_sm(PTO2_MODE_EXECUTE, sm, heap, kSmallHeap);
+
+    // Assert: the returned runtime must NOT claim ownership of the gm_heap.
+    ASSERT_NE(rt_, nullptr);
+    EXPECT_EQ(rt_->sm_handle, sm);
+    EXPECT_EQ(rt_->gm_heap, heap);
+    EXPECT_FALSE(rt_->gm_heap_owned);
+
+    // Cleanup: pto2_runtime_destroy consumes sm via pto2_sm_destroy (observed
+    // behavior, see pto_runtime2.cpp:339), so only free the gm_heap here.
+    pto2_runtime_destroy(rt_);
+    rt_ = nullptr;
+    std::free(heap);
+}
+
+// ---------- Destroy ----------
+
+TEST_F(RuntimeLifecycleTest, Destroy_NullRuntime_NoCrash) {
+    // Documented contract: destroy(nullptr) is a no-op.
+    pto2_runtime_destroy(nullptr);
+    SUCCEED();
+}
+
+TEST_F(RuntimeLifecycleTest, Destroy_ReleasesOwnedHeap) {
+    rt_ = pto2_runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+    ASSERT_NE(rt_, nullptr);
+    // Act: explicitly destroy and null out so TearDown doesn't double-free.
+    pto2_runtime_destroy(rt_);
+    rt_ = nullptr;
+    // Assert: reaching here without asan/ubsan complaint is the test (leak-free).
+    SUCCEED();
+}
+
+// ---------- set_mode ----------
+
+TEST_F(RuntimeLifecycleTest, SetMode_UpdatesField) {
+    rt_ = pto2_runtime_create_custom(PTO2_MODE_EXECUTE, kSmallWindow, kSmallHeap);
+    ASSERT_NE(rt_, nullptr);
+    ASSERT_EQ(rt_->mode, PTO2_MODE_EXECUTE);
+
+    // Act
+    pto2_runtime_set_mode(rt_, PTO2_MODE_GRAPH_ONLY);
+
+    // Assert
+    EXPECT_EQ(rt_->mode, PTO2_MODE_GRAPH_ONLY);
+}
+
+TEST_F(RuntimeLifecycleTest, SetMode_NullRuntime_NoCrash) {
+    // Contract: defensive null check, mirrors destroy.
+ pto2_runtime_set_mode(nullptr, PTO2_MODE_SIMULATE); + SUCCEED(); +} + +// ---------- Ops table wiring ---------- + +TEST_F(RuntimeLifecycleTest, OpsTable_AllFunctionPointersPopulated) { + rt_ = pto2_runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap); + ASSERT_NE(rt_, nullptr); + const PTO2RuntimeOps *ops = rt_->ops; + ASSERT_NE(ops, nullptr); + + // Hot-path ops called by the orchestration .so -- must never be null. + EXPECT_NE(ops->submit_task, nullptr); + EXPECT_NE(ops->alloc_tensors, nullptr); + EXPECT_NE(ops->scope_begin, nullptr); + EXPECT_NE(ops->scope_end, nullptr); + EXPECT_NE(ops->orchestration_done, nullptr); + EXPECT_NE(ops->is_fatal, nullptr); + EXPECT_NE(ops->report_fatal, nullptr); + EXPECT_NE(ops->get_tensor_data, nullptr); + EXPECT_NE(ops->set_tensor_data, nullptr); +} + +TEST_F(RuntimeLifecycleTest, IsFatal_FreshRuntime_ReturnsFalse) { + rt_ = pto2_runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap); + ASSERT_NE(rt_, nullptr); + EXPECT_FALSE(rt_->ops->is_fatal(rt_)); +} + +TEST_F(RuntimeLifecycleTest, ReportFatal_SetsFatalFlag) { + rt_ = pto2_runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap); + ASSERT_NE(rt_, nullptr); + + // Act + rt_->ops->report_fatal(rt_, PTO2_ERROR_EXPLICIT_ORCH_FATAL, "UT", "%s", "forced"); + + // Assert + EXPECT_TRUE(rt_->ops->is_fatal(rt_)); +} diff --git a/tests/ut/cpp/pto2_a2a3/test_runtime_status.cpp b/tests/ut/cpp/pto2_a2a3/test_runtime_status.cpp new file mode 100644 index 000000000..92c970e0f --- /dev/null +++ b/tests/ut/cpp/pto2_a2a3/test_runtime_status.cpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * UT for pto2_runtime_status_from_error_codes (pto_runtime_status.h).
+ *
+ * The helper merges an orchestrator code (1-99) and a scheduler code (100+)
+ * into a single negative AICPU return code. Orchestrator code wins when both
+ * are non-zero; positive codes are negated; already-negative codes pass
+ * through unchanged.
+ *
+ * Pure header-only function -- no runtime linkage required.
+ */
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+
+#include "pto_runtime_status.h"
+
+// ---------- happy path ----------
+
+TEST(RuntimeStatus, BothZero_ReturnsZero) {
+    EXPECT_EQ(pto2_runtime_status_from_error_codes(PTO2_ERROR_NONE, PTO2_ERROR_NONE), 0);
+}
+
+// ---------- single-channel ----------
+
+TEST(RuntimeStatus, OrchOnly_NegatesPositiveCode) {
+    EXPECT_EQ(
+        pto2_runtime_status_from_error_codes(PTO2_ERROR_SCOPE_DEADLOCK, PTO2_ERROR_NONE), -PTO2_ERROR_SCOPE_DEADLOCK
+    );
+}
+
+TEST(RuntimeStatus, SchedOnly_NegatesPositiveCode) {
+    EXPECT_EQ(
+        pto2_runtime_status_from_error_codes(PTO2_ERROR_NONE, PTO2_ERROR_SCHEDULER_TIMEOUT),
+        -PTO2_ERROR_SCHEDULER_TIMEOUT
+    );
+}
+
+// ---------- precedence: orchestrator wins ----------
+
+TEST(RuntimeStatus, BothNonZero_OrchTakesPrecedence) {
+    int32_t result = pto2_runtime_status_from_error_codes(PTO2_ERROR_INVALID_ARGS, PTO2_ERROR_SCHEDULER_TIMEOUT);
+    EXPECT_EQ(result, -PTO2_ERROR_INVALID_ARGS);
+}
+
+// ---------- already-negative passthrough (idempotency) ----------
+
+TEST(RuntimeStatus, NegativeOrchCode_PassesThrough) {
+    EXPECT_EQ(pto2_runtime_status_from_error_codes(-7, PTO2_ERROR_NONE), -7);
+}
+
+TEST(RuntimeStatus,
NegativeSchedCode_PassesThrough) { + EXPECT_EQ(pto2_runtime_status_from_error_codes(PTO2_ERROR_NONE, -101), -101); +} + +// ---------- equivalence-class coverage of every defined code ---------- + +TEST(RuntimeStatus, AllOrchestratorCodes_AreNegated) { + const int32_t codes[] = { + PTO2_ERROR_SCOPE_DEADLOCK, + PTO2_ERROR_HEAP_RING_DEADLOCK, + PTO2_ERROR_FLOW_CONTROL_DEADLOCK, + PTO2_ERROR_DEP_POOL_OVERFLOW, + PTO2_ERROR_INVALID_ARGS, + PTO2_ERROR_DEPENDENCY_OVERFLOW, + PTO2_ERROR_REQUIRE_SYNC_START_INVALID, + PTO2_ERROR_TENSOR_WAIT_TIMEOUT, + PTO2_ERROR_EXPLICIT_ORCH_FATAL, + }; + for (int32_t code : codes) { + SCOPED_TRACE(testing::Message() << "orch_code=" << code); + EXPECT_EQ(pto2_runtime_status_from_error_codes(code, PTO2_ERROR_NONE), -code); + } +} + +// ---------- contract guard: PTO2_ERROR_NONE is the only zero ---------- + +TEST(RuntimeStatus, NoneIsZero) { EXPECT_EQ(PTO2_ERROR_NONE, 0); } + +TEST(RuntimeStatus, OrchAndSchedRangesDoNotOverlap) { + // Orchestrator codes occupy 1..99; scheduler codes occupy 100+. + EXPECT_LT(PTO2_ERROR_EXPLICIT_ORCH_FATAL, 100); + EXPECT_GE(PTO2_ERROR_SCHEDULER_TIMEOUT, 100); +} diff --git a/tests/ut/cpp/pto2_a2a3/test_scheduler_edge.cpp b/tests/ut/cpp/pto2_a2a3/test_scheduler_edge.cpp new file mode 100644 index 000000000..78dfa7206 --- /dev/null +++ b/tests/ut/cpp/pto2_a2a3/test_scheduler_edge.cpp @@ -0,0 +1,835 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Edge-case tests for ReadyQueue, SharedMemory, and TaskState. + * + * ============================================================================ + * DESIGN CONTRACTS -- PTO2ReadyQueue (Vyukov MPMC) + * ============================================================================ + * + * DC-1 (sequence wrap): The sequence counter is int64_t. After 2^63 + * push/pop operations it wraps; comparisons still work because both + * positions wrap identically (two's complement). Practically + * unreachable; `-ftrapv` would convert it into a crash. + * + * DC-2 (pop fast-path): pop() checks `enqueue_pos == dequeue_pos` as an + * early-empty hint. A push between the hint and the CAS can race; this + * is the standard TOCTOU of Vyukov MPMC queues and acceptable. + * + * DC-3 (push returns false near full): All producers that see a full slot + * return false simultaneously even if a pop happens right after. + * Acceptable back-pressure, not a defect. + * + * DC-9 (size() relaxed ordering): size() reads both positions with + * memory_order_relaxed and is a hint, not a point-in-time snapshot. + * If a stale read produces d > e the guard returns 0. + * + * ============================================================================ + * DESIGN CONTRACTS -- Scheduler + * ============================================================================ + * + * DC-10 (release_fanin_and_check_ready CAS): the non-profiling overload + * does NOT CAS task_state before pushing. The profiling overload CASes + * purely so the operation can be counted; dispatch correctness in both + * builds derives from `fanin_refcount.fetch_add` -- only the thread that + * observes `new_refcount == fanin_count` pushes. NOT a bug. 
+ * + * DC-11 (LocalReadyBuffer LIFO dispatch): try_push appends at count++, pop + * returns slot_states[--count]. LIFO reversal is intentional for + * cache-locality when a producer immediately dispatches its fanout. + * + * DC-12 (on_subtask_complete double-completion): fetch_add is idempotent + * on a pure counter; a repeat call returns false because prev+1 != + * total_required_subtasks. No detection of double-call as a logic error + * -- caller contract. + * + * DC-13 (advance_ring_pointers, FORMERLY a null-deref candidate): + * HISTORICAL -- advance_ring_pointers no longer touches slot.task at all. + * It reads task_state == PTO2_TASK_CONSUMED only (see pto_scheduler.h). + * Heap reclamation was moved to PTO2TaskAllocator::update_heap_tail. + * + * ============================================================================ + * DESIGN CONTRACTS -- SharedMemory + * ============================================================================ + * + * DC-4 (pto2_sm_validate): checks `top > heap_size`. top == heap_size is + * a legitimate "filled exactly to end" state, so strict > is correct. + * + * BUG-CANDIDATE-5 (size calculation with task_window_size=0): if the + * runtime ever called `pto2_sm_calculate_size()` with 0, all ring + * descriptors/payloads would alias the same address. The current entry + * path is pto2_sm_create, which is called only with valid sizes, but + * there is no explicit guard. Real defect -- pto2_sm_create should + * reject task_window_size==0. Tests below pin this behavior. + * + * BUG-CANDIDATE-6 (flow control heap_top validation): `validate()` does + * not verify `heap_top <= heap_size`. After a corruption or an + * unbounded caller, heap_top could exceed heap_size without detection. + * Real defect -- validate should check both bounds. 
+ *
+ * ============================================================================
+ * DESIGN CONTRACTS -- TaskState
+ * ============================================================================
+ *
+ * EDGE-1: CAS on task_state with memory_order_relaxed could reorder with
+ *   subsequent reads of fanin_refcount. The actual scheduler code uses
+ *   acquire/release on task_state.
+ *
+ * EDGE-2: completed_subtasks uses fetch_add(1) with acq_rel ordering; the
+ *   thread that observes (prev+1) == total is the sole completer.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <thread>
+#include <vector>
+
+#include "pto_scheduler.h"
+#include "pto_shared_memory.h"
+#include "../test_helpers.h"
+
+// =============================================================================
+// ReadyQueue edge cases
+// =============================================================================
+class ReadyQueueEdgeTest : public ::testing::Test {
+protected:
+    static constexpr uint64_t QUEUE_CAP = 8; // Small for edge testing
+    PTO2ReadyQueueSlot slots[8]{};
+    PTO2ReadyQueue queue{};
+    PTO2TaskSlotState dummy[8]{};
+
+    void SetUp() override { test_ready_queue_init(&queue, slots, QUEUE_CAP); }
+};
+
+// ---------------------------------------------------------------------------
+// Push and pop interleaving: push(A), pop() -> A, push(B), pop() -> B
+// Ensures sequence numbers are correctly advanced after each operation.
+// --------------------------------------------------------------------------- +TEST_F(ReadyQueueEdgeTest, InterleavedPushPop) { + for (int i = 0; i < 20; i++) { + EXPECT_TRUE(queue.push(&dummy[0])); + PTO2TaskSlotState *s = queue.pop(); + EXPECT_EQ(s, &dummy[0]); + } + // After 20 interleaved push/pop, queue should be empty + EXPECT_EQ(queue.size(), 0u); + EXPECT_EQ(queue.pop(), nullptr); +} + +// --------------------------------------------------------------------------- +// Exactly fill queue, then pop all -- boundary at capacity +// --------------------------------------------------------------------------- +TEST_F(ReadyQueueEdgeTest, ExactCapacityFillDrain) { + // Push exactly capacity items + int pushed = 0; + for (uint64_t i = 0; i < QUEUE_CAP; i++) { + if (queue.push(&dummy[i % 8])) pushed++; + else break; + } + // Vyukov MPMC with capacity N can hold N-1 items (one slot is always empty) + // OR exactly N depending on implementation. + // The actual implementation checks `sequence == pos` which allows N items. 
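+    // Sketch assertion (added; relies only on DC-9's single-threaded
+    // exactness, which the SizeRelaxedOrdering test below also pins):
+    // size() must agree with the number of successful pushes at this point.
+    EXPECT_EQ(queue.size(), static_cast<uint64_t>(pushed));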
+    EXPECT_GE(pushed, (int)(QUEUE_CAP - 1));
+
+    // Pop all
+    for (int i = 0; i < pushed; i++) {
+        EXPECT_NE(queue.pop(), nullptr);
+    }
+    EXPECT_EQ(queue.pop(), nullptr);
+}
+
+// ---------------------------------------------------------------------------
+// Push to full queue: must return false
+// ---------------------------------------------------------------------------
+TEST_F(ReadyQueueEdgeTest, PushToFullQueue) {
+    // Fill the queue
+    int pushed = 0;
+    while (queue.push(&dummy[0]))
+        pushed++;
+
+    // Queue is now full
+    EXPECT_FALSE(queue.push(&dummy[1])) << "Push to full queue returns false";
+
+    // Pop one, then push should succeed again
+    EXPECT_NE(queue.pop(), nullptr);
+    EXPECT_TRUE(queue.push(&dummy[1])) << "Push succeeds after pop from full queue";
+}
+
+// ---------------------------------------------------------------------------
+// DC-9: size() uses relaxed ordering and can be stale under concurrency;
+// in a single-threaded context it must be exact.
+// ---------------------------------------------------------------------------
+TEST_F(ReadyQueueEdgeTest, SizeRelaxedOrdering) {
+    // Push 3 items
+    queue.push(&dummy[0]);
+    queue.push(&dummy[1]);
+    queue.push(&dummy[2]);
+
+    // In single-threaded context, size should be exact
+    EXPECT_EQ(queue.size(), 3u);
+
+    // Pop 1
+    queue.pop();
+    EXPECT_EQ(queue.size(), 2u);
+
+    // Pop remaining
+    queue.pop();
+    queue.pop();
+    EXPECT_EQ(queue.size(), 0u);
+}
+
+// ---------------------------------------------------------------------------
+// size() guard: after many push/pop cycles, size never goes negative
+// ---------------------------------------------------------------------------
+TEST_F(ReadyQueueEdgeTest, SizeNeverNegative) {
+    // After many push/pop cycles, verify size() always returns a sane value
+    for (int i = 0; i < 100; i++) {
+        ASSERT_TRUE(queue.push(&dummy[0]));
+        queue.pop();
+    }
+    // Queue is empty -- size must be 0, not negative or wrapped
+    EXPECT_EQ(queue.size(), 0u) << "size() returns 0 after balanced push/pop cycles";
+}
+
+// 
---------------------------------------------------------------------------
+// FIFO ordering: items come out in the order they were pushed
+// ---------------------------------------------------------------------------
+TEST_F(ReadyQueueEdgeTest, FIFOOrdering) {
+    for (int i = 0; i < 5; i++) {
+        ASSERT_TRUE(queue.push(&dummy[i]));
+    }
+
+    for (int i = 0; i < 5; i++) {
+        PTO2TaskSlotState *s = queue.pop();
+        ASSERT_NE(s, nullptr);
+        EXPECT_EQ(s, &dummy[i]) << "FIFO: item " << i << " should come out in order";
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Concurrent stress: many producers, many consumers, large volume
+// ---------------------------------------------------------------------------
+TEST_F(ReadyQueueEdgeTest, HighContentionStress) {
+    // Use a larger queue for stress testing
+    static constexpr uint64_t BIG_CAP = 256;
+    PTO2ReadyQueueSlot big_slots[BIG_CAP];
+    PTO2ReadyQueue big_queue{};
+    test_ready_queue_init(&big_queue, big_slots, BIG_CAP);
+
+    constexpr int N = 5000;
+    constexpr int P = 4, C = 4;
+    std::vector<PTO2TaskSlotState> items(N);
+    std::atomic<int> produced{0}, consumed{0};
+
+    auto producer = [&](int id) {
+        for (int i = id; i < N; i += P) {
+            while (!big_queue.push(&items[i])) {}
+            produced++;
+        }
+    };
+    auto consumer = [&]() {
+        while (consumed.load() < N) {
+            PTO2TaskSlotState *s = big_queue.pop();
+            if (s) consumed++;
+        }
+    };
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i < P; i++)
+        threads.emplace_back(producer, i);
+    for (int i = 0; i < C; i++)
+        threads.emplace_back(consumer);
+    for (auto &t : threads)
+        t.join();
+
+    EXPECT_EQ(produced.load(), N);
+    EXPECT_EQ(consumed.load(), N);
+}
+
+// ---------------------------------------------------------------------------
+// Concurrent stress: verify no duplicates consumed
+// Uses pointer identity (address comparison) instead of repurposing
+// production struct fields as test tags.
+// ---------------------------------------------------------------------------
+TEST_F(ReadyQueueEdgeTest, NoDuplicateConsumption) {
+    static constexpr uint64_t BIG_CAP = 128;
+    PTO2ReadyQueueSlot big_slots[BIG_CAP];
+    PTO2ReadyQueue big_queue{};
+    test_ready_queue_init(&big_queue, big_slots, BIG_CAP);
+
+    constexpr int N = 1000;
+    std::vector<PTO2TaskSlotState> items(N);
+
+    // Track consumed items by pointer address in a separate array
+    std::vector<int> consumed_count(N, 0);
+
+    auto item_index = [&](PTO2TaskSlotState *s) -> int {
+        return static_cast<int>(s - items.data());
+    };
+
+    // Push all items
+    for (int i = 0; i < N; i++) {
+        while (!big_queue.push(&items[i])) {
+            // Drain some if full
+            PTO2TaskSlotState *s = big_queue.pop();
+            if (s) consumed_count[item_index(s)]++;
+        }
+    }
+
+    // Pop remaining
+    while (true) {
+        PTO2TaskSlotState *s = big_queue.pop();
+        if (!s) break;
+        consumed_count[item_index(s)]++;
+    }
+
+    // Verify each item consumed exactly once
+    int total_consumed = 0;
+    for (int i = 0; i < N; i++) {
+        EXPECT_EQ(consumed_count[i], 1) << "Item " << i << " consumed " << consumed_count[i] << " times";
+        total_consumed += consumed_count[i];
+    }
+    EXPECT_EQ(total_consumed, N) << "Each item should be consumed exactly once";
+}
+
+// ---------------------------------------------------------------------------
+// Pop from empty queue multiple times -- must always return nullptr
+// ---------------------------------------------------------------------------
+TEST_F(ReadyQueueEdgeTest, RepeatedEmptyPop) {
+    for (int i = 0; i < 100; i++) {
+        EXPECT_EQ(queue.pop(), nullptr);
+    }
+    // After 100 empty pops, size should still be 0
+    EXPECT_EQ(queue.size(), 0u);
+}
+
+// ---------------------------------------------------------------------------
+// Push-pop cycles: sequences grow far past capacity (small queue, many cycles;
+// per DC-1, an actual 2^63 wrap is practically unreachable)
+// ---------------------------------------------------------------------------
+TEST_F(ReadyQueueEdgeTest, ManyPushPopCycles) {
+    // With capacity 8, sequence numbers
grow by 1 per push/pop.
+    // After many cycles, sequences grow large but should remain correct.
+    for (int i = 0; i < 10000; i++) {
+        ASSERT_TRUE(queue.push(&dummy[0]));
+        PTO2TaskSlotState *s = queue.pop();
+        ASSERT_NE(s, nullptr);
+        EXPECT_EQ(s, &dummy[0]);
+    }
+
+    // Queue should be empty and still functional
+    EXPECT_EQ(queue.size(), 0u);
+    EXPECT_TRUE(queue.push(&dummy[1]));
+    EXPECT_EQ(queue.pop(), &dummy[1]);
+}
+
+// =============================================================================
+// LocalReadyBuffer edge cases
+// =============================================================================
+
+// ---------------------------------------------------------------------------
+// DC-11: LocalReadyBuffer LIFO dispatch order
+// push adds at [count++], pop returns [--count].
+// Last pushed = first popped = LIFO, not FIFO.
+// ---------------------------------------------------------------------------
+TEST(LocalReadyBufferTest, LIFODispatchOrder) {
+    PTO2TaskSlotState *storage[8]{};
+    PTO2LocalReadyBuffer buf;
+    buf.reset(storage, 8);
+
+    PTO2TaskSlotState items[4]{};
+    // Push in order: 0, 1, 2, 3
+    for (int i = 0; i < 4; i++) {
+        ASSERT_TRUE(buf.try_push(&items[i]));
+    }
+
+    // Pop order should be LIFO: 3, 2, 1, 0 (reverse of push)
+    // Use pointer identity to verify ordering
+    for (int i = 3; i >= 0; i--) {
+        PTO2TaskSlotState *s = buf.pop();
+        ASSERT_NE(s, nullptr);
+        EXPECT_EQ(s, &items[i]) << "LocalReadyBuffer pops in LIFO order (priority reversed)";
+    }
+
+    // This means if tasks A, B, C, D become ready (in dependency order),
+    // they are dispatched as D, C, B, A -- reversed, which DC-11 accepts as
+    // an intentional cache-locality trade-off.
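+    // Sketch re-check of DC-11 (added; assumes the drained buffer can be
+    // reused without reset): a second push/push/pop/pop round must again
+    // come out reversed, leaving the buffer empty for the check below.
+    ASSERT_TRUE(buf.try_push(&items[0]));
+    ASSERT_TRUE(buf.try_push(&items[1]));
+    EXPECT_EQ(buf.pop(), &items[1]);
+    EXPECT_EQ(buf.pop(), &items[0]);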
+ EXPECT_EQ(buf.pop(), nullptr) << "Empty after draining"; +} + +// --------------------------------------------------------------------------- +// LocalReadyBuffer overflow: try_push returns false at capacity +// --------------------------------------------------------------------------- +TEST(LocalReadyBufferTest, OverflowBehavior) { + PTO2TaskSlotState *storage[4]{}; + PTO2LocalReadyBuffer buf; + buf.reset(storage, 4); + + PTO2TaskSlotState items[6]{}; + int pushed = 0; + for (int i = 0; i < 6; i++) { + if (buf.try_push(&items[i])) pushed++; + } + + EXPECT_EQ(pushed, 4) << "Only 4 items fit in capacity-4 buffer"; + EXPECT_FALSE(buf.try_push(&items[5])) << "5th push fails"; +} + +// --------------------------------------------------------------------------- +// LocalReadyBuffer with nullptr backing: all pushes fail +// --------------------------------------------------------------------------- +TEST(LocalReadyBufferTest, NullBackingBuffer) { + PTO2LocalReadyBuffer buf; + buf.reset(nullptr, 0); + + PTO2TaskSlotState item{}; + EXPECT_FALSE(buf.try_push(&item)) << "Push fails with null backing"; + EXPECT_EQ(buf.pop(), nullptr) << "Pop returns null with null backing"; +} + +// --------------------------------------------------------------------------- +// LocalReadyBuffer reset clears state +// --------------------------------------------------------------------------- +TEST(LocalReadyBufferTest, ResetClearsState) { + PTO2TaskSlotState *storage[8]{}; + PTO2LocalReadyBuffer buf; + buf.reset(storage, 8); + + PTO2TaskSlotState item{}; + buf.try_push(&item); + buf.try_push(&item); + + // After reset, buffer should behave as empty + buf.reset(storage, 8); + EXPECT_EQ(buf.pop(), nullptr) << "Buffer is empty after reset"; + + // Should accept pushes again up to capacity + for (int i = 0; i < 8; i++) { + EXPECT_TRUE(buf.try_push(&item)); + } + EXPECT_FALSE(buf.try_push(&item)) << "Full after pushing capacity items"; +} + +// 
============================================================================= +// SharedMemory edge cases +// ============================================================================= + +// --------------------------------------------------------------------------- +// BUG-CANDIDATE-5: Zero window size +// --------------------------------------------------------------------------- +TEST(SharedMemEdgeTest, ZeroWindowSize) { + uint64_t size = pto2_sm_calculate_size(0); + // With window=0, only header is counted + uint64_t header_size = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); + EXPECT_EQ(size, header_size); + + PTO2SharedMemoryHandle *h = pto2_sm_create(0, 4096); + if (h) { + // All ring descriptors should point to the same location (after header) + for (int r = 0; r < PTO2_MAX_RING_DEPTH - 1; r++) { + EXPECT_EQ(h->header->rings[r].task_descriptors, h->header->rings[r + 1].task_descriptors) + << "Zero window: all rings' descriptor pointers collapse to same address"; + } + pto2_sm_destroy(h); + } +} + +// --------------------------------------------------------------------------- +// Validate detects corrupted flow control +// --------------------------------------------------------------------------- +TEST(SharedMemEdgeTest, ValidateDetectsCorruption) { + PTO2SharedMemoryHandle *h = pto2_sm_create(256, 4096); + ASSERT_NE(h, nullptr); + EXPECT_TRUE(pto2_sm_validate(h)); + + // Corrupt: set current_task_index to negative value + h->header->rings[0].fc.current_task_index.store(-1); + EXPECT_FALSE(pto2_sm_validate(h)); + + pto2_sm_destroy(h); +} + +// --------------------------------------------------------------------------- +// Validate with null handle +// --------------------------------------------------------------------------- +TEST(SharedMemEdgeTest, ValidateNullHandle) { EXPECT_FALSE(pto2_sm_validate(nullptr)); } + +// --------------------------------------------------------------------------- +// Create from undersized buffer +// 
--------------------------------------------------------------------------- +TEST(SharedMemEdgeTest, CreateFromUndersizedBuffer) { + char buf[64]{}; + PTO2SharedMemoryHandle *h = pto2_sm_create_from_buffer(buf, 64, 256, 4096); + EXPECT_EQ(h, nullptr) << "Undersized buffer should fail"; +} + +// --------------------------------------------------------------------------- +// Per-ring different window sizes via pto2_sm_calculate_size_per_ring +// --------------------------------------------------------------------------- +TEST(SharedMemEdgeTest, PerRingDifferentSizes) { + uint64_t ws[PTO2_MAX_RING_DEPTH] = {128, 256, 512, 1024}; + uint64_t size = pto2_sm_calculate_size_per_ring(ws); + + // Size should be larger than uniform 128 + uint64_t uniform_size = pto2_sm_calculate_size(128); + EXPECT_GT(size, uniform_size); +} + +// --------------------------------------------------------------------------- +// Shared memory layout: descriptor and payload regions don't overlap +// --------------------------------------------------------------------------- +TEST(SharedMemEdgeTest, RegionsNonOverlapping) { + PTO2SharedMemoryHandle *h = pto2_sm_create(64, 4096); + ASSERT_NE(h, nullptr); + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + uintptr_t desc_start = (uintptr_t)h->header->rings[r].task_descriptors; + uintptr_t desc_end = desc_start + 64 * sizeof(PTO2TaskDescriptor); + uintptr_t payload_start = (uintptr_t)h->header->rings[r].task_payloads; + + // Payloads should start at or after descriptors end + EXPECT_GE(payload_start, desc_end) << "Ring " << r << ": payload region should not overlap descriptors"; + } + + // Adjacent rings should not overlap + for (int r = 0; r < PTO2_MAX_RING_DEPTH - 1; r++) { + uintptr_t this_payload_end = (uintptr_t)h->header->rings[r].task_payloads + 64 * sizeof(PTO2TaskPayload); + uintptr_t next_desc_start = (uintptr_t)h->header->rings[r + 1].task_descriptors; + EXPECT_GE(next_desc_start, this_payload_end) << "Ring " << r << " and " << (r + 1) 
<< " should not overlap";
+    }
+
+    pto2_sm_destroy(h);
+}
+
+// ---------------------------------------------------------------------------
+// Shared memory header alignment
+// ---------------------------------------------------------------------------
+TEST(SharedMemEdgeTest, HeaderAlignment) {
+    PTO2SharedMemoryHandle *h = pto2_sm_create(256, 4096);
+    ASSERT_NE(h, nullptr);
+
+    uintptr_t header_addr = (uintptr_t)h->header;
+    EXPECT_EQ(header_addr % PTO2_ALIGN_SIZE, 0u) << "Header must be cache-line aligned";
+
+    pto2_sm_destroy(h);
+}
+
+// ---------------------------------------------------------------------------
+// Flow control init state
+// ---------------------------------------------------------------------------
+TEST(SharedMemEdgeTest, FlowControlInitState) {
+    PTO2SharedMemoryHandle *h = pto2_sm_create(256, 4096);
+    ASSERT_NE(h, nullptr);
+
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto &fc = h->header->rings[r].fc;
+        EXPECT_EQ(fc.current_task_index.load(), 0) << "Ring " << r << " current_task_index should init to 0";
+        EXPECT_EQ(fc.last_task_alive.load(), 0) << "Ring " << r << " last_task_alive should init to 0";
+    }
+
+    pto2_sm_destroy(h);
+}
+
+// =============================================================================
+// TaskState edge cases
+// =============================================================================
+
+// ---------------------------------------------------------------------------
+// DC-10 (design contract): Non-profiling release_fanin skips task_state CAS.
+//
+// The non-profiling release_fanin_and_check_ready() intentionally does NOT
+// CAS(PENDING -> READY). Readiness is determined solely by fanin_refcount
+// reaching fanin_count -- the atomic fetch_add guarantees exactly one thread
+// sees the final count and pushes to the ready queue. The profiling overload
+// adds the CAS only to count atomic operations. No consumer inspects
+// task_state == READY for dispatch; it is metadata for profiling only.
+// +// This test anchors the design: task_state stays PENDING after the +// non-profiling ready path, confirming the CAS is profiling-only. +// --------------------------------------------------------------------------- +TEST(TaskStateEdgeTest, NonProfilingReadyPath_TaskStateStaysPending) { + PTO2TaskSlotState slot{}; + slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); + slot.fanin_count = 1; + slot.fanin_refcount.store(0, std::memory_order_relaxed); + slot.active_mask = PTO2_SUBTASK_MASK_AIC; + + // Simulate non-profiling release_fanin_and_check_ready: + int32_t new_refcount = slot.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1; + bool ready = (new_refcount == slot.fanin_count); + ASSERT_TRUE(ready) << "Task should be detected as ready via refcount"; + + // task_state remains PENDING -- this is correct by design. + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_PENDING) + << "Non-profiling path intentionally does not transition task_state to READY"; +} + +// --------------------------------------------------------------------------- +// EDGE-2: Simultaneous subtask completion -- verify exactly one completer +// Uses the current completed_subtasks counter model (not deprecated done_mask). 
+// ---------------------------------------------------------------------------
+TEST(TaskStateEdgeTest, SimultaneousSubtaskCompletion) {
+    constexpr int ROUNDS = 1000;
+
+    for (int round = 0; round < ROUNDS; round++) {
+        PTO2TaskSlotState slot{};
+        slot.active_mask = PTO2_SUBTASK_MASK_AIC | PTO2_SUBTASK_MASK_AIV0;
+        slot.total_required_subtasks = 2; // 1 block * 2 active subtasks
+        slot.completed_subtasks.store(0);
+        std::atomic<int> completers{0};
+
+        auto complete_subtask = [&]() {
+            int16_t prev = slot.completed_subtasks.fetch_add(1, std::memory_order_acq_rel);
+            if ((prev + 1) == slot.total_required_subtasks) {
+                completers++;
+            }
+        };
+
+        std::thread t1(complete_subtask);
+        std::thread t2(complete_subtask);
+        t1.join();
+        t2.join();
+
+        // Exactly ONE thread should see full completion
+        EXPECT_EQ(completers.load(), 1) << "Round " << round << ": exactly 1 thread should trigger completion";
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Double subtask completion (same core completes twice)
+// With the counter model, a double-completion increments the counter twice,
+// potentially reaching total_required_subtasks prematurely -- a real bug risk
+// that the bitmask model was immune to (fetch_or is idempotent for same bit).
+// ---------------------------------------------------------------------------
+TEST(TaskStateEdgeTest, DoubleSubtaskCompletion) {
+    PTO2TaskSlotState slot{};
+    slot.active_mask = PTO2_SUBTASK_MASK_AIC | PTO2_SUBTASK_MASK_AIV0;
+    slot.total_required_subtasks = 2;
+    slot.completed_subtasks.store(0);
+
+    // Complete AIC subtask
+    int16_t prev1 = slot.completed_subtasks.fetch_add(1, std::memory_order_acq_rel);
+    bool first_complete = ((prev1 + 1) == slot.total_required_subtasks);
+    EXPECT_FALSE(first_complete) << "Single completion doesn't complete the task";
+    EXPECT_EQ(prev1, 0);
+
+    // Complete AIC AGAIN (double-completion -- logic error)
+    // With counter model, this incorrectly reaches total_required_subtasks
+    int16_t prev2 = slot.completed_subtasks.fetch_add(1, std::memory_order_acq_rel);
+    bool second_complete = ((prev2 + 1) == slot.total_required_subtasks);
+    EXPECT_TRUE(second_complete) << "Counter model: double-completion of same core falsely triggers completion. "
+                                    "Unlike the old bitmask model, the counter cannot detect duplicate completions.";
+}
+
+// ---------------------------------------------------------------------------
+// Three subtasks: AIC + AIV0 + AIV1 (counter model)
+// ---------------------------------------------------------------------------
+TEST(TaskStateEdgeTest, ThreeSubtaskCompletion) {
+    constexpr int ROUNDS = 500;
+
+    for (int round = 0; round < ROUNDS; round++) {
+        PTO2TaskSlotState slot{};
+        slot.active_mask = PTO2_SUBTASK_MASK_AIC | PTO2_SUBTASK_MASK_AIV0 | PTO2_SUBTASK_MASK_AIV1;
+        slot.total_required_subtasks = 3; // 1 block * 3 active subtasks
+        slot.completed_subtasks.store(0);
+        std::atomic<int> completers{0};
+
+        auto complete = [&]() {
+            int16_t prev = slot.completed_subtasks.fetch_add(1, std::memory_order_acq_rel);
+            if ((prev + 1) == slot.total_required_subtasks) {
+                completers++;
+            }
+        };
+
+        std::thread t1(complete);
+        std::thread t2(complete);
+        std::thread t3(complete);
+        t1.join();
+        t2.join();
+        t3.join();
+
+        EXPECT_EQ(completers.load(), 1) << "Round " << round << ": exactly 1 of 3 threads triggers completion";
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Fanout lock contention: two threads trying to lock the same task
+// ---------------------------------------------------------------------------
+TEST(TaskStateEdgeTest, FanoutLockContention) {
+    PTO2TaskSlotState slot{};
+    slot.fanout_lock.store(0);
+
+    constexpr int N = 10000;
+    std::atomic<int> acquired{0};
+
+    auto lock_unlock = [&]() {
+        for (int i = 0; i < N; i++) {
+            // Spin-lock: CAS(0 -> 1)
+            int32_t expected = 0;
+            while (!slot.fanout_lock.compare_exchange_weak(
+                expected, 1, std::memory_order_acquire, std::memory_order_relaxed
+            )) {
+                expected = 0;
+            }
+            acquired++;
+            slot.fanout_lock.store(0, std::memory_order_release);
+        }
+    };
+
+    std::thread t1(lock_unlock);
+    std::thread t2(lock_unlock);
+    t1.join();
+    t2.join();
+
+    EXPECT_EQ(acquired.load(), 2 * N);
+}
+
+// ---------------------------------------------------------------------------
+// Fanin refcount: verify exactly-once ready detection
+// ---------------------------------------------------------------------------
+TEST(TaskStateEdgeTest, FaninExactlyOnceReady) {
+    constexpr int ROUNDS = 1000;
+
+    for (int round = 0; round < ROUNDS; round++) {
+        PTO2TaskSlotState slot{};
+        slot.fanin_count = 3;
+        slot.fanin_refcount.store(0);
+        std::atomic<int> ready_detectors{0};
+
+        auto release_fanin = [&]() {
+            int32_t prev = slot.fanin_refcount.fetch_add(1, std::memory_order_acq_rel);
+            if (prev + 1 == slot.fanin_count) {
+                ready_detectors++;
+            }
+        };
+
+        std::thread t1(release_fanin);
+        std::thread t2(release_fanin);
+        std::thread t3(release_fanin);
+        t1.join();
+        t2.join();
+        t3.join();
+
+        EXPECT_EQ(ready_detectors.load(), 1) << "Round " << round << ": exactly 1 thread detects task ready";
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Fanout refcount: verify exactly-once CONSUMED detection
+// ---------------------------------------------------------------------------
+TEST(TaskStateEdgeTest, FanoutExactlyOnceConsumed) {
+    constexpr int ROUNDS = 1000;
+
+    for (int round = 0; round < ROUNDS; round++) {
+        PTO2TaskSlotState slot{};
+        slot.fanout_count = 4;  // 1 scope + 3 consumers
+        slot.fanout_refcount.store(0);
+        slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed);
+        std::atomic<int> consumed_detectors{0};
+
+        auto release_fanout = [&]() {
+            int32_t prev = slot.fanout_refcount.fetch_add(1, std::memory_order_acq_rel);
+            if (prev + 1 == slot.fanout_count) {
+                // Only one thread should see this
+                PTO2TaskState expected = PTO2_TASK_COMPLETED;
+                if (slot.task_state.compare_exchange_strong(
+                        expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire
+                    )) {
+                    consumed_detectors++;
+                }
+            }
+        };
+
+        std::vector<std::thread> threads;
+        for (int i = 0; i < 4; i++) {
+            threads.emplace_back(release_fanout);
+        }
+        for (auto &t : threads)
+            t.join();
+
+        EXPECT_EQ(consumed_detectors.load(), 1) << "Round " << round << ": exactly 1 thread detects CONSUMED";
+        EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED);
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Task state machine: full lifecycle PENDING -> READY -> RUNNING -> COMPLETED -> CONSUMED
+// ---------------------------------------------------------------------------
+TEST(TaskStateEdgeTest, FullLifecycle) {
+    PTO2TaskSlotState slot{};
+    slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
+
+    // PENDING -> READY (when all fanin satisfied)
+    PTO2TaskState expected = PTO2_TASK_PENDING;
+    EXPECT_TRUE(slot.task_state.compare_exchange_strong(expected, PTO2_TASK_READY));
+    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_READY);
+
+    // READY -> RUNNING (when dispatched to core)
+    expected = PTO2_TASK_READY;
+    EXPECT_TRUE(slot.task_state.compare_exchange_strong(expected, PTO2_TASK_RUNNING));
+    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_RUNNING);
+
+    // RUNNING -> COMPLETED (when subtasks done)
+    slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
+    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_COMPLETED);
+
+    // COMPLETED -> CONSUMED (when all fanout released)
+    expected = PTO2_TASK_COMPLETED;
+    EXPECT_TRUE(slot.task_state.compare_exchange_strong(expected, PTO2_TASK_CONSUMED));
+    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED);
+}
+
+// ---------------------------------------------------------------------------
+// Task state: invalid transition -- CONSUMED requires COMPLETED (skip READY/RUNNING)
+// ---------------------------------------------------------------------------
+TEST(TaskStateEdgeTest, InvalidTransition) {
+    PTO2TaskSlotState slot{};
+    slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
+
+    // Try to CAS COMPLETED -> CONSUMED when the state is actually PENDING -- must fail
+    PTO2TaskState expected = PTO2_TASK_COMPLETED;
+    EXPECT_FALSE(slot.task_state.compare_exchange_strong(expected, PTO2_TASK_CONSUMED))
+        << "Cannot transition from non-COMPLETED to CONSUMED";
+    EXPECT_EQ(expected, PTO2_TASK_PENDING) << "CAS updates `expected` to the actual state";
+}
+
+// ---------------------------------------------------------------------------
+// check_and_handle_consumed race: two threads calling simultaneously
+// Only one should succeed in the CAS(COMPLETED -> CONSUMED)
+// ---------------------------------------------------------------------------
+TEST(TaskStateEdgeTest, ConsumedRace) {
+    constexpr int ROUNDS = 1000;
+
+    for (int round = 0; round < ROUNDS; round++) {
+        PTO2TaskSlotState slot{};
+        slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed);
+        slot.fanout_count = 2;
+        slot.fanout_refcount.store(2, std::memory_order_relaxed);  // All released
+        std::atomic<int> consumed{0};
+
+        auto try_consume = [&]() {
+            if (slot.fanout_refcount.load() != slot.fanout_count) return;
+            PTO2TaskState exp = PTO2_TASK_COMPLETED;
+            if
(slot.task_state.compare_exchange_strong( + exp, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire + )) { + consumed++; + } + }; + + std::thread t1(try_consume); + std::thread t2(try_consume); + t1.join(); + t2.join(); + + EXPECT_EQ(consumed.load(), 1) << "Round " << round << ": exactly 1 thread succeeds in CONSUMED CAS"; + } +} diff --git a/tests/ut/cpp/pto2_a2a3/test_scheduler_state.cpp b/tests/ut/cpp/pto2_a2a3/test_scheduler_state.cpp new file mode 100644 index 000000000..dec6b0a2a --- /dev/null +++ b/tests/ut/cpp/pto2_a2a3/test_scheduler_state.cpp @@ -0,0 +1,242 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2SchedulerState from pto_scheduler.h + * + * Tests task state transitions, fanin/fanout logic, subtask completion. 
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstring>
+#include <thread>
+#include <vector>
+
+#include "pto_scheduler.h"
+
+class SchedulerStateTest : public ::testing::Test {
+protected:
+    PTO2SchedulerState sched;
+    PTO2SharedMemoryHandle *sm_handle = nullptr;
+
+    void SetUp() override {
+        sm_handle = pto2_sm_create_default();
+        ASSERT_NE(sm_handle, nullptr);
+        bool ok = pto2_scheduler_init(&sched, sm_handle->header);
+        ASSERT_TRUE(ok);
+    }
+
+    void TearDown() override {
+        pto2_scheduler_destroy(&sched);
+        if (sm_handle) {
+            pto2_sm_destroy(sm_handle);
+        }
+    }
+
+    void init_slot(
+        PTO2TaskSlotState &slot, PTO2TaskState state, int32_t fanin_count, int32_t fanout_count, uint8_t ring_id = 0
+    ) {
+        memset(&slot, 0, sizeof(slot));
+        slot.task_state.store(state);
+        slot.fanin_count = fanin_count;
+        slot.fanin_refcount.store(0);
+        slot.fanout_count = fanout_count;
+        slot.fanout_refcount.store(0);
+        slot.fanout_lock.store(0);
+        slot.fanout_head = nullptr;
+        slot.ring_id = ring_id;
+        slot.active_mask = PTO2_SUBTASK_MASK_AIC;
+        slot.completed_subtasks.store(0);
+        slot.total_required_subtasks = 1;
+        slot.logical_block_num = 1;
+    }
+};
+
+// =============================================================================
+// check_and_handle_consumed
+// =============================================================================
+
+TEST_F(SchedulerStateTest, ConsumedNotReady) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_COMPLETED, 1, 2);
+    slot.fanout_refcount.store(1);  // 1 != 2
+
+    sched.check_and_handle_consumed(slot);
+    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_COMPLETED);
+}
+
+TEST_F(SchedulerStateTest, ConsumedTransition) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_COMPLETED, 1, 2);
+    slot.fanout_refcount.store(2);  // matches fanout_count
+
+    sched.check_and_handle_consumed(slot);
+    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED);
+}
+
+TEST_F(SchedulerStateTest, ConsumedNotCompletedState) {
+    alignas(64) PTO2TaskSlotState slot;
init_slot(slot, PTO2_TASK_RUNNING, 1, 1); + slot.fanout_refcount.store(1); + + sched.check_and_handle_consumed(slot); + // CAS fails because state is RUNNING, not COMPLETED + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_RUNNING); +} + +TEST_F(SchedulerStateTest, ConsumedIdempotent) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_CONSUMED, 1, 1); + slot.fanout_refcount.store(1); + + sched.check_and_handle_consumed(slot); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED); +} + +// ============================================================================= +// release_producer +// ============================================================================= + +TEST_F(SchedulerStateTest, ReleaseProducerIncrements) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_COMPLETED, 1, 3); + + sched.release_producer(slot); + EXPECT_EQ(slot.fanout_refcount.load(), 1); + + sched.release_producer(slot); + EXPECT_EQ(slot.fanout_refcount.load(), 2); +} + +TEST_F(SchedulerStateTest, ReleaseProducerTriggersConsumed) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_COMPLETED, 1, 2); + slot.fanout_refcount.store(1); // One away + + sched.release_producer(slot); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED); +} + +// ============================================================================= +// release_fanin_and_check_ready +// ============================================================================= + +TEST_F(SchedulerStateTest, FaninPartialNotReady) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_PENDING, 3, 1); + + bool ready = sched.release_fanin_and_check_ready(slot); + EXPECT_FALSE(ready); + EXPECT_EQ(slot.fanin_refcount.load(), 1); +} + +TEST_F(SchedulerStateTest, FaninAllSatisfiedReady) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_PENDING, 1, 1); + + bool ready = sched.release_fanin_and_check_ready(slot); + EXPECT_TRUE(ready); +} + +// 
=============================================================================
+// on_subtask_complete
+// =============================================================================
+
+TEST_F(SchedulerStateTest, SubtaskCompleteSingle) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_RUNNING, 1, 1);
+    slot.total_required_subtasks = 1;
+    slot.completed_subtasks.store(0);
+
+    EXPECT_TRUE(sched.on_subtask_complete(slot));
+}
+
+TEST_F(SchedulerStateTest, SubtaskCompleteMultiBlock) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_RUNNING, 1, 1);
+    slot.total_required_subtasks = 6;  // 3 cores * 2 blocks
+    slot.completed_subtasks.store(0);
+
+    for (int i = 0; i < 5; i++) {
+        EXPECT_FALSE(sched.on_subtask_complete(slot));
+    }
+    EXPECT_TRUE(sched.on_subtask_complete(slot));
+}
+
+TEST_F(SchedulerStateTest, SubtaskCompleteConcurrent) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_RUNNING, 1, 1);
+    slot.total_required_subtasks = 6;
+    slot.completed_subtasks.store(0);
+
+    std::atomic<int> true_count{0};
+    std::vector<std::thread> threads;
+    for (int i = 0; i < 6; i++) {
+        threads.emplace_back([&]() {
+            if (sched.on_subtask_complete(slot)) {
+                true_count.fetch_add(1);
+            }
+        });
+    }
+    for (auto &t : threads)
+        t.join();
+
+    EXPECT_EQ(true_count.load(), 1);
+    EXPECT_EQ(slot.completed_subtasks.load(), 6);
+}
+
+// =============================================================================
+// on_scope_end
+// =============================================================================
+
+TEST_F(SchedulerStateTest, ScopeEndBatchRelease) {
+    constexpr int N = 4;
+    alignas(64) PTO2TaskSlotState slots[N];
+    PTO2TaskSlotState *ptrs[N];
+
+    for (int i = 0; i < N; i++) {
+        init_slot(slots[i], PTO2_TASK_COMPLETED, 1, 2);
+        ptrs[i] = &slots[i];
+    }
+
+    sched.on_scope_end(ptrs, N);
+
+    for (int i = 0; i < N; i++) {
+        EXPECT_EQ(slots[i].fanout_refcount.load(), 1);
+    }
+}
+
+//
============================================================================= +// get_ready_tasks_batch: local buffer first +// ============================================================================= + +TEST_F(SchedulerStateTest, GetReadyTasksBatchLocalFirst) { + alignas(64) PTO2TaskSlotState slot_a, slot_b; + init_slot(slot_a, PTO2_TASK_READY, 0, 1); + init_slot(slot_b, PTO2_TASK_READY, 0, 1); + + PTO2TaskSlotState *local_buf_storage[4]; + PTO2LocalReadyBuffer local_buf; + local_buf.reset(local_buf_storage, 4); + local_buf.try_push(&slot_a); + + // Push slot_b to global queue + sched.ready_queues[0].push(&slot_b); + + PTO2TaskSlotState *out[4]; + int count = sched.get_ready_tasks_batch(PTO2ResourceShape::AIC, local_buf, out, 4); + + EXPECT_EQ(count, 2); + // Local buffer drains first (LIFO), so slot_a comes first + EXPECT_EQ(out[0], &slot_a); + EXPECT_EQ(out[1], &slot_b); +} diff --git a/tests/ut/cpp/pto2_a2a3/test_shared_memory.cpp b/tests/ut/cpp/pto2_a2a3/test_shared_memory.cpp new file mode 100644 index 000000000..025a11dbf --- /dev/null +++ b/tests/ut/cpp/pto2_a2a3/test_shared_memory.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Unit tests for PTO2SharedMemory layout from pto_shared_memory.h
+ */
+
+#include <gtest/gtest.h>
+
+#include "pto_shared_memory.h"
+
+class SharedMemoryTest : public ::testing::Test {
+protected:
+    PTO2SharedMemoryHandle *handle = nullptr;
+
+    void SetUp() override {
+        handle = pto2_sm_create_default();
+        ASSERT_NE(handle, nullptr);
+    }
+
+    void TearDown() override {
+        if (handle) {
+            pto2_sm_destroy(handle);
+            handle = nullptr;
+        }
+    }
+};
+
+TEST_F(SharedMemoryTest, CreateDefaultReturnsNonNull) {
+    EXPECT_NE(handle->sm_base, nullptr);
+    EXPECT_GT(handle->sm_size, 0u);
+}
+
+TEST_F(SharedMemoryTest, IsOwner) { EXPECT_TRUE(handle->is_owner); }
+
+TEST_F(SharedMemoryTest, HeaderInitValues) {
+    auto *hdr = handle->header;
+    EXPECT_EQ(hdr->orchestrator_done.load(), 0);
+    EXPECT_EQ(hdr->orch_error_code.load(), 0);
+    EXPECT_EQ(hdr->sched_error_bitmap.load(), 0);
+    EXPECT_EQ(hdr->sched_error_code.load(), 0);
+
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto &fc = hdr->rings[r].fc;
+        EXPECT_EQ(fc.current_task_index.load(), 0);
+        EXPECT_EQ(fc.last_task_alive.load(), 0);
+    }
+}
+
+TEST_F(SharedMemoryTest, Validate) { EXPECT_TRUE(pto2_sm_validate(handle)); }
+
+TEST_F(SharedMemoryTest, PerRingIndependence) {
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        EXPECT_NE(handle->header->rings[r].task_descriptors, nullptr) << "Ring " << r;
+        EXPECT_NE(handle->header->rings[r].task_payloads, nullptr) << "Ring " << r;
+    }
+    // Different rings should have different pointers
+    for (int r = 1; r < PTO2_MAX_RING_DEPTH; r++) {
+        EXPECT_NE(handle->header->rings[r].task_descriptors, handle->header->rings[0].task_descriptors) << "Ring " << r;
+    }
+}
+
+TEST_F(SharedMemoryTest, PointerAlignment) {
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto addr = reinterpret_cast<uintptr_t>(handle->header->rings[r].task_descriptors);
+        EXPECT_EQ(addr % PTO2_ALIGN_SIZE, 0u) << "Ring "
<< r << " descriptors not aligned"; + } +} + +TEST(SharedMemoryCalcSize, NonZero) { + uint64_t size = pto2_sm_calculate_size(PTO2_TASK_WINDOW_SIZE); + EXPECT_GT(size, 0u); +} + +TEST(SharedMemoryCalcSize, LargerWindowGivesLargerSize) { + uint64_t small_size = pto2_sm_calculate_size(64); + uint64_t large_size = pto2_sm_calculate_size(256); + EXPECT_GT(large_size, small_size); +} + +TEST(SharedMemoryCalcSize, HeaderAligned) { EXPECT_EQ(sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE, 0u); } diff --git a/tests/ut/cpp/pto2_a2a3/test_submit_types.cpp b/tests/ut/cpp/pto2_a2a3/test_submit_types.cpp new file mode 100644 index 000000000..f9e0d7641 --- /dev/null +++ b/tests/ut/cpp/pto2_a2a3/test_submit_types.cpp @@ -0,0 +1,148 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for pto_submit_types.h + * + * Tests submit contract types: subtask masks, resource shapes, + * active mask derivation, and launch spec. 
+ */
+
+#include <gtest/gtest.h>
+
+#include "pto_submit_types.h"
+
+// =============================================================================
+// pto2_subtask_active
+// =============================================================================
+
+TEST(SubtaskActive, AICMaskActivatesAICSlot) {
+    EXPECT_TRUE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIC, PTO2SubtaskSlot::AIC));
+    EXPECT_FALSE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIC, PTO2SubtaskSlot::AIV0));
+    EXPECT_FALSE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIC, PTO2SubtaskSlot::AIV1));
+}
+
+TEST(SubtaskActive, AIV0MaskActivatesAIV0Slot) {
+    EXPECT_FALSE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIV0, PTO2SubtaskSlot::AIC));
+    EXPECT_TRUE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIV0, PTO2SubtaskSlot::AIV0));
+    EXPECT_FALSE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIV0, PTO2SubtaskSlot::AIV1));
+}
+
+TEST(SubtaskActive, AIV1MaskActivatesAIV1Slot) {
+    EXPECT_FALSE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIV1, PTO2SubtaskSlot::AIC));
+    EXPECT_FALSE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIV1, PTO2SubtaskSlot::AIV0));
+    EXPECT_TRUE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIV1, PTO2SubtaskSlot::AIV1));
+}
+
+TEST(SubtaskActive, CombinedMask) {
+    uint8_t mask = PTO2_SUBTASK_MASK_AIC | PTO2_SUBTASK_MASK_AIV1;
+    EXPECT_TRUE(pto2_subtask_active(mask, PTO2SubtaskSlot::AIC));
+    EXPECT_FALSE(pto2_subtask_active(mask, PTO2SubtaskSlot::AIV0));
+    EXPECT_TRUE(pto2_subtask_active(mask, PTO2SubtaskSlot::AIV1));
+}
+
+TEST(SubtaskActive, AllActive) {
+    uint8_t mask = PTO2_SUBTASK_MASK_AIC | PTO2_SUBTASK_MASK_AIV0 | PTO2_SUBTASK_MASK_AIV1;
+    EXPECT_TRUE(pto2_subtask_active(mask, PTO2SubtaskSlot::AIC));
+    EXPECT_TRUE(pto2_subtask_active(mask, PTO2SubtaskSlot::AIV0));
+    EXPECT_TRUE(pto2_subtask_active(mask, PTO2SubtaskSlot::AIV1));
+}
+
+// =============================================================================
+// pto2_active_mask_to_shape
+// =============================================================================
+
+TEST(ActiveMaskToShape,
SingleAIC) { + EXPECT_EQ(pto2_active_mask_to_shape(PTO2_SUBTASK_MASK_AIC), PTO2ResourceShape::AIC); +} + +TEST(ActiveMaskToShape, SingleAIV0) { + EXPECT_EQ(pto2_active_mask_to_shape(PTO2_SUBTASK_MASK_AIV0), PTO2ResourceShape::AIV); +} + +TEST(ActiveMaskToShape, SingleAIV1) { + EXPECT_EQ(pto2_active_mask_to_shape(PTO2_SUBTASK_MASK_AIV1), PTO2ResourceShape::AIV); +} + +TEST(ActiveMaskToShape, TwoActiveBecomesMIX) { + uint8_t mask = PTO2_SUBTASK_MASK_AIC | PTO2_SUBTASK_MASK_AIV0; + EXPECT_EQ(pto2_active_mask_to_shape(mask), PTO2ResourceShape::MIX); +} + +TEST(ActiveMaskToShape, AllThreeBecomesMIX) { + uint8_t mask = PTO2_SUBTASK_MASK_AIC | PTO2_SUBTASK_MASK_AIV0 | PTO2_SUBTASK_MASK_AIV1; + EXPECT_EQ(pto2_active_mask_to_shape(mask), PTO2ResourceShape::MIX); +} + +// ============================================================================= +// pto2_mixed_kernels_to_active_mask +// ============================================================================= + +TEST(MixedKernelsToMask, AllInvalid) { + MixedKernels mk; + EXPECT_EQ(pto2_mixed_kernels_to_active_mask(mk), 0); +} + +TEST(MixedKernelsToMask, AICOnly) { + MixedKernels mk; + mk.aic_kernel_id = 42; + EXPECT_EQ(pto2_mixed_kernels_to_active_mask(mk), PTO2_SUBTASK_MASK_AIC); +} + +TEST(MixedKernelsToMask, AIV0Only) { + MixedKernels mk; + mk.aiv0_kernel_id = 7; + EXPECT_EQ(pto2_mixed_kernels_to_active_mask(mk), PTO2_SUBTASK_MASK_AIV0); +} + +TEST(MixedKernelsToMask, AllValid) { + MixedKernels mk; + mk.aic_kernel_id = 1; + mk.aiv0_kernel_id = 2; + mk.aiv1_kernel_id = 3; + uint8_t expected = PTO2_SUBTASK_MASK_AIC | PTO2_SUBTASK_MASK_AIV0 | PTO2_SUBTASK_MASK_AIV1; + EXPECT_EQ(pto2_mixed_kernels_to_active_mask(mk), expected); +} + +// ============================================================================= +// MixedKernels defaults +// ============================================================================= + +TEST(MixedKernels, DefaultsAreInvalid) { + MixedKernels mk; + EXPECT_EQ(mk.aic_kernel_id, 
INVALID_KERNEL_ID); + EXPECT_EQ(mk.aiv0_kernel_id, INVALID_KERNEL_ID); + EXPECT_EQ(mk.aiv1_kernel_id, INVALID_KERNEL_ID); +} + +// ============================================================================= +// PTO2LaunchSpec +// ============================================================================= + +TEST(LaunchSpec, DefaultBlockNumIsOne) { + PTO2LaunchSpec spec; + EXPECT_EQ(spec.block_num(), 1); +} + +TEST(LaunchSpec, SetAndGet) { + PTO2LaunchSpec spec; + spec.set_block_num(4); + EXPECT_EQ(spec.block_num(), 4); +} + +// ============================================================================= +// Constants +// ============================================================================= + +TEST(Constants, SubtaskSlotCount) { EXPECT_EQ(PTO2_SUBTASK_SLOT_COUNT, 3); } + +TEST(Constants, NumResourceShapes) { EXPECT_EQ(PTO2_NUM_RESOURCE_SHAPES, 3); } + +TEST(Constants, InvalidKernelId) { EXPECT_EQ(INVALID_KERNEL_ID, -1); } diff --git a/tests/ut/cpp/pto2_a2a3/test_tensormap_edge.cpp b/tests/ut/cpp/pto2_a2a3/test_tensormap_edge.cpp new file mode 100644 index 000000000..db409ac57 --- /dev/null +++ b/tests/ut/cpp/pto2_a2a3/test_tensormap_edge.cpp @@ -0,0 +1,632 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Edge-case tests for TensorMap and Tensor overlap detection. + * + * ============================================================================ + * ANALYSIS FINDINGS -- check_overlap() in PTO2TensorMapEntry + * ============================================================================ + * + * BUG-CANDIDATE-1 (Overlap fast path): check_overlap() loops for + * entry->ndims, reading input.shapes[i] for all i < entry->ndims. + * When input has fewer dimensions, shapes[i] beyond input->ndims are + * stale (Tensor::init only copies ndims elements). The result is + * non-deterministic -- depends on whatever value happens to be in memory. + * The test poisons input.shapes[1] to make the stale read deterministic + * and proves the loop bound is wrong. + * + * BUG-CANDIDATE-2 (Overlap slow path): The slow path constructs Segment from + * offsets and shapes. But it uses `uint64_t in_off = input.offsets[i]` when + * `input.is_all_offset_zero` is false. If ndims < RUNTIME_MAX_TENSOR_DIMS, + * offsets[ndims..4] may be uninitialized garbage. The loop runs for + * entry->ndims iterations, which could exceed input->ndims. + * -> Actually the loop runs for `ndims` which is the ENTRY's ndims. + * If entry->ndims > input->ndims, input->shapes[i] beyond input->ndims is 0. + * Segment{in_off, in_off + 0} has length 0 -> intersection is always false + * -> returns NO_OVERLAP. This might be wrong if the extra dimensions + * are broadcast or don't exist. + * + * BUG-CANDIDATE-3 (Dimension mismatch): check_overlap uses entry->ndims + * exclusively, ignoring input->ndims. If input has MORE dimensions than + * entry, the extra input dimensions are never checked. This could miss + * partial overlaps in higher dimensions. + * + * BUG-CANDIDATE-4 (Lookup result saturation): PTO2_LOOKUP_MAX_RESULTS = 16. 
+ * If more than 16 overlapping entries exist, results are silently dropped. + * This means dependencies can be missed in highly-connected graphs. + * + * BUG-CANDIDATE-5 (TensorMap new_entry pool exhaustion): new_entry() calls + * `always_assert(next_entry_idx < pool_size)` which throws/aborts when the + * pool is fully used AND free_list is empty. There's no graceful fallback. + * + * BUG-CANDIDATE-6 (Hash collision with cleanup): DISMISSED. + * cleanup_retired() uses debug_assert to verify entry belongs to the + * retiring task. In theory, if the cleanup range exceeds task_window_size, + * slot reuse causes ABA. However, sync_tensormap()'s overlap check + * (pto_tensormap.cpp:244) triggers cleanup every time the current task's + * slot collides with last_cleanup, bounding the cleanup range to at most + * task_window_size. This guarantees each slot maps to exactly one task + * in any cleanup pass. The scenario is unreachable in production. + * + * BUG-CANDIDATE-7 (copy_from_tensor doesn't zero beyond ndims): When + * copying shapes[]/offsets[] from Tensor to Entry, only ndims elements + * are copied. shapes[ndims..4] retain whatever was in the entry before + * (from pool reuse). check_overlap loops for entry->ndims, so garbage + * data beyond ndims could affect overlap detection if the loop ever + * reads beyond what was copied. Currently safe because the loop uses + * entry->ndims which matches what was copied, but fragile. + * + * ============================================================================ + * ANALYSIS FINDINGS -- Tensor struct + * ============================================================================ + * + * EDGE-1: Tensor with 0 dimensions (ndims=0). No shapes/offsets. + * check_overlap loop doesn't execute -> returns COVERED (fast path, contains=true). + * Two 0-dim tensors at same addr are always "covered". + * + * EDGE-2: Tensor with maximum dimensions (ndims=5). + * All shape/offset arrays fully used. 
+ *
+ * EDGE-3: Shape of 0 in one dimension. Segment = {off, off+0} = empty.
+ *     line_segment_intersection({off, off+0}, {x, y}) = (off+0 > x) && (y > off)
+ *     = (off > x) && (y > off). An empty segment may or may not intersect.
+ *
+ * EDGE-4: Cleanup ABA -- DISMISSED. sync_tensormap()'s overlap check
+ * bounds the cleanup range to at most task_window_size, so a single slot never
+ * maps to two different tasks within one cleanup_retired() call.
+ */
+
+#include <gtest/gtest.h>
+#include <cstdint>
+#include <cstring>
+#include "common.h"
+#include "pto_tensormap.h"
+#include "pto_orchestration_api.h"
+
+// =============================================================================
+// Helpers
+// =============================================================================
+
+static Tensor
+make_tensor_nd(uint64_t addr, uint32_t ndims, const uint32_t shapes[], const uint32_t offsets[], int32_t version = 0) {
+    // Seed with make_tensor_external() (Tensor's default ctor is private).
+    // Use a dummy 1-dim shape for the seed; we overwrite everything via init().
+    uint32_t seed_shape[1] = {1};
+    Tensor t = make_tensor_external(
+        reinterpret_cast<void *>(addr), seed_shape, 1, DataType::FLOAT32, /*manual_dep=*/false, /*version=*/0
+    );
+    uint32_t s[RUNTIME_MAX_TENSOR_DIMS]{};
+    uint32_t rs[RUNTIME_MAX_TENSOR_DIMS]{};
+    uint32_t o[RUNTIME_MAX_TENSOR_DIMS]{};
+    bool all_zero = true;
+    for (uint32_t i = 0; i < ndims && i < RUNTIME_MAX_TENSOR_DIMS; i++) {
+        s[i] = shapes[i];
+        rs[i] = shapes[i];
+        o[i] = offsets ? offsets[i] : 0;
+        if (o[i] != 0) all_zero = false;
+    }
+    uint64_t total = 4;
+    for (uint32_t i = 0; i < ndims; i++)
+        total *= (rs[i] + (offsets ?
offsets[i] : 0));
+    t.init((void *)addr, total, rs, s, o, ndims, DataType::FLOAT32, version, all_zero, /*is_raw_eq_shapes=*/true);
+    return t;
+}
+
+class TensorMapEdgeTest : public ::testing::Test {
+protected:
+    PTO2TensorMap tmap{};
+    int32_t window_sizes[PTO2_MAX_RING_DEPTH]{};
+
+    void SetUp() override {
+        for (int i = 0; i < PTO2_MAX_RING_DEPTH; i++)
+            window_sizes[i] = 64;
+        ASSERT_TRUE(tmap.init(256, 512, window_sizes));
+    }
+    void TearDown() override { tmap.destroy(); }
+};
+
+// ---------------------------------------------------------------------------
+// EDGE-1: Zero dimensions (ndims=0)
+// ---------------------------------------------------------------------------
+TEST_F(TensorMapEdgeTest, ZeroDimensionTensor) {
+    uint32_t seed_shape[1] = {1};
+    Tensor t = make_tensor_external(reinterpret_cast<void *>(0x2000), seed_shape, 1, DataType::FLOAT32, false, 0);
+    uint32_t s[5]{}, o[5]{};
+    t.init((void *)0x2000, 0, s, s, o, 0, DataType::FLOAT32, 0, true, true);
+
+    PTO2TaskId task = PTO2TaskId::make(0, 0);
+    tmap.insert(t, task);
+
+    PTO2LookupResult result;
+    result.count = 0;
+    tmap.lookup(t, result);
+
+    EXPECT_GE(result.count, 1);
+    if (result.count > 0) {
+        // ndims=0: fast-path loop doesn't execute, contains=true -> COVERED
+        EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED);
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Zero dimensions: two different 0-dim tensors at the same address are always COVERED.
+// This is semantically questionable -- should scalar tensors be independent?
+// ---------------------------------------------------------------------------
+TEST_F(TensorMapEdgeTest, TwoZeroDimTensorsSameAddr) {
+    uint32_t seed_shape[1] = {1};
+    Tensor t1 = make_tensor_external(reinterpret_cast<void *>(0x2100), seed_shape, 1, DataType::FLOAT32, false, 0);
+    Tensor t2 = make_tensor_external(reinterpret_cast<void *>(0x2100), seed_shape, 1, DataType::FLOAT32, false, 0);
+    uint32_t s[5]{}, o[5]{};
+    t1.init((void *)0x2100, 0, s, s, o, 0, DataType::FLOAT32, 0, true, true);
+    t2.init((void *)0x2100, 0, s, s, o, 0, DataType::FLOAT32, 0, true, true);
+
+    tmap.insert(t1, PTO2TaskId::make(0, 0));
+    tmap.insert(t2, PTO2TaskId::make(0, 1));
+
+    PTO2LookupResult result;
+    result.count = 0;
+    tmap.lookup(t1, result);
+
+    // Both 0-dim entries report COVERED for any 0-dim input at the same addr
+    EXPECT_EQ(result.count, 2);
+    for (int i = 0; i < result.count; i++) {
+        EXPECT_EQ(result.entries[i].overlap_status, OverlapStatus::COVERED)
+            << "0-dim tensors always report COVERED (empty loop -> contains=true)";
+    }
+}
+
+// ---------------------------------------------------------------------------
+// BUG-CANDIDATE-4: Lookup result saturation (>16 producers)
+// ---------------------------------------------------------------------------
+TEST_F(TensorMapEdgeTest, LookupResultSaturation) {
+    uint32_t shapes[] = {100};
+    Tensor t = make_tensor_nd(0x4000, 1, shapes, nullptr, 0);
+
+    // Insert 20 producers for the same tensor
+    for (int i = 0; i < 20; i++) {
+        tmap.insert(t, PTO2TaskId::make(0, i));
+    }
+
+    PTO2LookupResult result;
+    result.count = 0;
+    tmap.lookup(t, result);
+
+    // Only 16 results fit -- 4 dependencies are silently dropped
+    EXPECT_EQ(result.count, PTO2_LOOKUP_MAX_RESULTS)
+        << "More than 16 overlapping producers: results saturated, deps missed";
+}
+
+// ---------------------------------------------------------------------------
+// BUG-CANDIDATE-4 extended: saturation drops the OLDEST producers (newest first).
+// Because insert() adds at the head of the bucket chain,
lookup traverses newest first. +// The first 16 (newest) entries fill the result, dropping the 4 oldest. +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, LookupSaturationDropsOldest) { + uint32_t shapes[] = {100}; + Tensor t = make_tensor_nd(0x4100, 1, shapes, nullptr, 0); + + for (int i = 0; i < 20; i++) { + tmap.insert(t, PTO2TaskId::make(0, i)); + } + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(t, result); + + ASSERT_EQ(result.count, PTO2_LOOKUP_MAX_RESULTS); + + // Verify the kept results are the newest 16 (tasks 19, 18, ..., 4) + // and the oldest 4 (tasks 0, 1, 2, 3) are dropped + for (int i = 0; i < result.count; i++) { + int32_t local_id = result.entries[i].entry->producer_task_id.local(); + // The newest entries are inserted at head, so lookup sees them first + EXPECT_GE(local_id, 4) << "Oldest tasks (0-3) should be the ones dropped by saturation"; + } +} + +// --------------------------------------------------------------------------- +// Version-based overlap: newer version returns OTHER +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, VersionMismatchReturnsOther) { + uint32_t shapes[] = {100}; + Tensor v0 = make_tensor_nd(0x5000, 1, shapes, nullptr, 0); + Tensor v1 = make_tensor_nd(0x5000, 1, shapes, nullptr, 1); + + tmap.insert(v0, PTO2TaskId::make(0, 0)); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(v1, result); + + EXPECT_EQ(result.count, 1); + // Version 1 > Version 0 -> OTHER (not COVERED) + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER); +} + +// --------------------------------------------------------------------------- +// Version: Same version, same shapes -> COVERED +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, SameVersionSameShapesCovered) { + uint32_t shapes[] = {100}; + Tensor t = make_tensor_nd(0x5100, 1, shapes, 
nullptr, 0); + + tmap.insert(t, PTO2TaskId::make(0, 0)); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(t, result); + + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED) << "Same version + same shapes -> COVERED"; +} + +// --------------------------------------------------------------------------- +// Partial overlap 1D: [0:100] vs [50:150] +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, PartialOverlap1D) { + uint32_t prod_shapes[] = {100}; + Tensor prod = make_tensor_nd(0x6000, 1, prod_shapes, nullptr, 0); + tmap.insert(prod, PTO2TaskId::make(0, 0)); + + // Consumer reads [50:150] -- partial overlap + uint32_t cons_shapes[] = {100}; + uint32_t cons_offsets[] = {50}; + Tensor cons = make_tensor_nd(0x6000, 1, cons_shapes, cons_offsets, 0); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(cons, result); + + EXPECT_EQ(result.count, 1); + // Consumer [50,150) vs Producer [0,100) -> intersection = [50,100). 
+ // Consumer does NOT contain producer -> OTHER + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER); +} + +// --------------------------------------------------------------------------- +// Consumer fully covers producer: COVERED +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, ConsumerCoversProducer) { + // Producer writes [10:20] + uint32_t prod_shapes[] = {10}; + uint32_t prod_offsets[] = {10}; + Tensor prod = make_tensor_nd(0x7000, 1, prod_shapes, prod_offsets, 0); + tmap.insert(prod, PTO2TaskId::make(0, 0)); + + // Consumer reads [0:100] -- fully covers producer + uint32_t cons_shapes[] = {100}; + Tensor cons = make_tensor_nd(0x7000, 1, cons_shapes, nullptr, 0); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(cons, result); + + EXPECT_EQ(result.count, 1); + // Consumer [0,100) contains Producer [10,20) -> COVERED + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED); +} + +// --------------------------------------------------------------------------- +// Adjacent regions: [0:100] vs [100:200] -> NO_OVERLAP +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, AdjacentNoOverlap) { + uint32_t prod_shapes[] = {100}; + Tensor prod = make_tensor_nd(0x8000, 1, prod_shapes, nullptr, 0); + tmap.insert(prod, PTO2TaskId::make(0, 0)); + + uint32_t cons_shapes[] = {100}; + uint32_t cons_offsets[] = {100}; + Tensor cons = make_tensor_nd(0x8000, 1, cons_shapes, cons_offsets, 0); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(cons, result); + + // [0,100) vs [100,200) -> end(100) > begin(100)? 
No -> NO_OVERLAP + EXPECT_EQ(result.count, 0); +} + +// --------------------------------------------------------------------------- +// One-element overlap: [0:100] vs [99:199] +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, OneElementOverlap) { + uint32_t prod_shapes[] = {100}; + Tensor prod = make_tensor_nd(0x8100, 1, prod_shapes, nullptr, 0); + tmap.insert(prod, PTO2TaskId::make(0, 0)); + + uint32_t cons_shapes[] = {100}; + uint32_t cons_offsets[] = {99}; + Tensor cons = make_tensor_nd(0x8100, 1, cons_shapes, cons_offsets, 0); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(cons, result); + + // [0,100) vs [99,199) -> intersection = [99,100) = 1 element + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER) << "Partial overlap (1 element) -> OTHER"; +} + +// --------------------------------------------------------------------------- +// EDGE-3: Shape of 0 in one dimension (empty segment behavior) +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, ZeroShapeInDimension) { + // Producer: 2D [10, 0] -- zero in dim 1 + uint32_t prod_shapes[] = {10, 0}; + Tensor prod = make_tensor_nd(0x8200, 2, prod_shapes, nullptr, 0); + tmap.insert(prod, PTO2TaskId::make(0, 0)); + + // Consumer: 2D [10, 20] + uint32_t cons_shapes[] = {10, 20}; + Tensor cons = make_tensor_nd(0x8200, 2, cons_shapes, nullptr, 0); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(cons, result); + + if (result.count > 0) { + // Fast path: input.shapes[1](20) < entry.shapes[1](0)? No, 20 >= 0. + // -> contains = true -> COVERED. + // But the producer wrote ZERO elements in dim 1! + // Should a zero-area producer be "covered" by any consumer? + // This is semantically questionable. 
+ EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED) + << "Zero-shape producer is COVERED by any consumer (empty production)"; + } +} + +// --------------------------------------------------------------------------- +// 2D overlap: different slices +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, MultiDimOverlap) { + // Producer: 2D [10, 20] at offset [0, 0] + uint32_t prod_shapes[] = {10, 20}; + Tensor prod = make_tensor_nd(0x9000, 2, prod_shapes, nullptr, 0); + tmap.insert(prod, PTO2TaskId::make(0, 0)); + + // Consumer: 2D [5, 10] at offset [2, 5] -- overlaps partially + uint32_t cons_shapes[] = {5, 10}; + uint32_t cons_offsets[] = {2, 5}; + Tensor cons = make_tensor_nd(0x9000, 2, cons_shapes, cons_offsets, 0); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(cons, result); + + EXPECT_EQ(result.count, 1); + // Consumer [2,7)x[5,15) vs Producer [0,10)x[0,20) + // check_overlap checks if INPUT(consumer) contains ENTRY(producer): + // Dim 0: consumer [2,7) does NOT contain producer [0,10) -> contains=false + // Dim 1: consumer [5,15) does NOT contain producer [0,20) -> contains=false + // All dims intersect, but consumer doesn't fully cover -> OTHER + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER) + << "Consumer sub-region inside producer: overlap exists but not COVERED"; +} + +// --------------------------------------------------------------------------- +// 2D: Consumer exceeds producer in one dimension -> OTHER +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, MultiDimPartialOverlap) { + uint32_t prod_shapes[] = {10, 20}; + Tensor prod = make_tensor_nd(0x9100, 2, prod_shapes, nullptr, 0); + tmap.insert(prod, PTO2TaskId::make(0, 0)); + + // Consumer: [8, 25] -- exceeds producer in dim 1 (25 > 20) + uint32_t cons_shapes[] = {8, 25}; + Tensor cons = make_tensor_nd(0x9100, 2, cons_shapes, nullptr, 0); + + 
PTO2LookupResult result; + result.count = 0; + tmap.lookup(cons, result); + + EXPECT_EQ(result.count, 1); + // Fast path: shapes comparison + // input.shapes[0]=8 >= entry.shapes[0]=10? No -> contains=false -> OTHER + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER); +} + +// --------------------------------------------------------------------------- +// 5D full overlap test (maximum dimensions) +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, FullFiveDimensionalOverlap) { + uint32_t prod_shapes[] = {2, 3, 4, 5, 6}; + Tensor prod = make_tensor_nd(0x9200, 5, prod_shapes, nullptr, 0); + tmap.insert(prod, PTO2TaskId::make(0, 0)); + + // Consumer with larger shapes in all dims -> COVERED + uint32_t cons_shapes[] = {4, 6, 8, 10, 12}; + Tensor cons = make_tensor_nd(0x9200, 5, cons_shapes, nullptr, 0); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(cons, result); + + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED) + << "5D consumer covers 5D producer in all dimensions"; +} + +// --------------------------------------------------------------------------- +// Cleanup then insert: verify chain integrity +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, CleanupThenReuseSlot) { + uint32_t shapes[] = {100}; + Tensor t = make_tensor_nd(0xA000, 1, shapes, nullptr, 0); + + // Insert entries for tasks 0-7 + for (int i = 0; i < 8; i++) { + tmap.insert(t, PTO2TaskId::make(0, i)); + } + EXPECT_EQ(tmap.valid_count(), 8); + + // Cleanup tasks 0-4 + tmap.cleanup_retired(0, 0, 5); + tmap.sync_validity(0, 5); + EXPECT_EQ(tmap.valid_count(), 3); // tasks 5,6,7 remain + + // Re-insert with new task IDs that reuse slots 0-4 + // (task window = 64, so IDs 64-68 map to slots 0-4) + for (int i = 64; i < 69; i++) { + tmap.insert(t, PTO2TaskId::make(0, i)); + } + + PTO2LookupResult result; + 
result.count = 0; + tmap.lookup(t, result); + + // Should find 8 entries: 3 old (5,6,7) + 5 new (64-68) + EXPECT_EQ(result.count, 8); +} + +// --------------------------------------------------------------------------- +// Hash distribution: addresses that are multiples of common alignment +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, HashDistributionAlignedAddresses) { + // Typical device addresses are 256-byte or 1024-byte aligned + // The hash function should distribute these well + std::set<uint32_t> buckets_used; + for (int i = 0; i < 100; i++) { + uint64_t addr = 0x10000 + i * 1024; + uint32_t bucket = tmap.hash(addr); + buckets_used.insert(bucket); + } + // With 256 buckets and 100 addresses, we should use many distinct buckets + // (poor hash would cluster aligned addresses into few buckets) + EXPECT_GT(buckets_used.size(), 50u) << "Hash should distribute 1024-aligned addresses across many buckets"; +} + +// --------------------------------------------------------------------------- +// Lookup on empty TensorMap +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, LookupEmpty) { + uint32_t shapes[] = {100}; + Tensor t = make_tensor_nd(0xC000, 1, shapes, nullptr, 0); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(t, result); + + EXPECT_EQ(result.count, 0) << "Empty TensorMap returns no results"; +} + +// --------------------------------------------------------------------------- +// Lazy invalidation: entries become stale when last_task_alive advances +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, LazyInvalidation) { + uint32_t shapes[] = {100}; + Tensor t = make_tensor_nd(0xD000, 1, shapes, nullptr, 0); + + // Insert entries for tasks 0-4 + for (int i = 0; i < 5; i++) { + tmap.insert(t, PTO2TaskId::make(0, i)); + } + + // All 5 should be found + PTO2LookupResult result; + 
result.count = 0; + tmap.lookup(t, result); + EXPECT_EQ(result.count, 5); + + // Advance validity threshold: tasks 0-2 become stale + tmap.sync_validity(0, 3); + + result.count = 0; + tmap.lookup(t, result); + EXPECT_EQ(result.count, 2) << "Only tasks 3,4 are valid after sync_validity(3)"; +} + +// --------------------------------------------------------------------------- +// entry_valid with different rings: ring isolation +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, RingIsolation) { + uint32_t shapes[] = {100}; + Tensor t = make_tensor_nd(0xE000, 1, shapes, nullptr, 0); + + // Insert in ring 0 (task 0) and ring 1 (task 0) + tmap.insert(t, PTO2TaskId::make(0, 0)); + tmap.insert(t, PTO2TaskId::make(1, 0)); + + // Invalidate ring 0's tasks but not ring 1's + tmap.sync_validity(0, 1); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(t, result); + + // Only ring 1's entry should remain valid + EXPECT_EQ(result.count, 1); + if (result.count == 1) { + EXPECT_EQ(result.entries[0].entry->producer_task_id.ring(), 1) + << "Ring 0's entry is invalidated; ring 1's entry survives"; + } +} + +// --------------------------------------------------------------------------- +// Multiple tensors at different addresses: no cross-contamination +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, DifferentAddressesIsolated) { + uint32_t shapes[] = {100}; + Tensor t1 = make_tensor_nd(0xF000, 1, shapes, nullptr, 0); + Tensor t2 = make_tensor_nd(0xF100, 1, shapes, nullptr, 0); + + tmap.insert(t1, PTO2TaskId::make(0, 0)); + tmap.insert(t2, PTO2TaskId::make(0, 1)); + + PTO2LookupResult result1; + result1.count = 0; + tmap.lookup(t1, result1); + EXPECT_EQ(result1.count, 1); + + PTO2LookupResult result2; + result2.count = 0; + tmap.lookup(t2, result2); + EXPECT_EQ(result2.count, 1); + + // Each lookup only finds its own producer + if (result1.count == 1 && 
result2.count == 1) { + EXPECT_NE( + result1.entries[0].entry->producer_task_id.local(), result2.entries[0].entry->producer_task_id.local() + ); + } +} + +// --------------------------------------------------------------------------- +// Free list recycling: after cleanup, new inserts reuse freed entries +// without exhausting the pool. Verified via observable behavior (pool +// doesn't exhaust) rather than internal pool index inspection. +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, FreeListRecycling) { + uint32_t shapes[] = {100}; + Tensor t = make_tensor_nd(0x10000, 1, shapes, nullptr, 0); + + // Insert 60 entries (within window_size=64, no slot collision) + for (int i = 0; i < 60; i++) { + tmap.insert(t, PTO2TaskId::make(0, i)); + } + + // Cleanup all 60 (range 0..60 < window_size=64, no ABA) + tmap.cleanup_retired(0, 0, 60); + tmap.sync_validity(0, 60); + + // Insert another 60 -- should succeed because freed entries are reused + for (int i = 60; i < 120; i++) { + tmap.insert(t, PTO2TaskId::make(0, i)); + } + + // Verify via lookup: all 60 new entries should be findable + PTO2LookupResult result; + result.count = 0; + tmap.lookup(t, result); + // Capped at PTO2_LOOKUP_MAX_RESULTS=16, but count > 0 proves entries exist + EXPECT_EQ(result.count, PTO2_LOOKUP_MAX_RESULTS) << "After cleanup+reinsert, new entries are findable -- " + "free list recycling keeps the pool from exhausting"; +} diff --git a/tests/ut/cpp/test_a5_pto2_fatal.cpp b/tests/ut/cpp/pto2_a5/test_a5_pto2_fatal.cpp similarity index 90% rename from tests/ut/cpp/test_a5_pto2_fatal.cpp rename to tests/ut/cpp/pto2_a5/test_a5_pto2_fatal.cpp index 83d9483b1..2346d1911 100644 --- a/tests/ut/cpp/test_a5_pto2_fatal.cpp +++ b/tests/ut/cpp/pto2_a5/test_a5_pto2_fatal.cpp @@ -41,6 +41,8 @@ struct FakeRuntime { std::string last_fatal_message; }; +static_assert(offsetof(FakeRuntime, ops) == 0); // Guard: reinterpret_cast below assumes ops is first 
member. + +FakeRuntime *as_fake(PTO2Runtime *rt) { return reinterpret_cast<FakeRuntime *>(rt); } TaskOutputTensors fake_submit(PTO2Runtime *rt, const MixedKernels &, const Arg &) { @@ -87,20 +89,20 @@ TaskOutputTensors fake_alloc_tensors(PTO2Runtime *rt, const Arg &) { } const PTO2RuntimeOps kFakeOps = { - fake_submit, - fake_scope_begin, - fake_scope_end, - fake_orchestration_done, - fake_is_fatal, - fake_report_fatal, - fake_log, - fake_log, - fake_log, - fake_log, - fake_log, - fake_get_tensor_data, - fake_set_tensor_data, - fake_alloc_tensors, + .submit_task = fake_submit, + .scope_begin = fake_scope_begin, + .scope_end = fake_scope_end, + .orchestration_done = fake_orchestration_done, + .is_fatal = fake_is_fatal, + .report_fatal = fake_report_fatal, + .log_error = fake_log, + .log_warn = fake_log, + .log_info = fake_log, + .log_debug = fake_log, + .log_always = fake_log, + .get_tensor_data = fake_get_tensor_data, + .set_tensor_data = fake_set_tensor_data, + .alloc_tensors = fake_alloc_tensors, }; class RuntimeBindingGuard { diff --git a/tests/ut/cpp/stubs/test_stubs.cpp b/tests/ut/cpp/stubs/test_stubs.cpp new file mode 100644 index 000000000..b9593ed08 --- /dev/null +++ b/tests/ut/cpp/stubs/test_stubs.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Link-time stubs for platform APIs used by runtime headers. + * + * Provides x86-compatible implementations of functions declared in + * platform headers (unified_log.h, device_time.h, common.h) so that + * runtime data structures can be unit-tested on CI runners without + * Ascend hardware or SDK. + */ + +#include <chrono> +#include <cstdarg> +#include <cstdint> +#include <cstdio> +#include <stdexcept> +#include <string> + +// ============================================================================= +// unified_log.h stubs (5 log-level functions) +// ============================================================================= + +extern "C" { + +void unified_log_error(const char *func, const char *fmt, ...) { + va_list args; + va_start(args, fmt); + fprintf(stderr, "[ERROR] %s: ", func); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n"); + va_end(args); +} + +void unified_log_warn(const char *func, const char *fmt, ...) { + va_list args; + va_start(args, fmt); + fprintf(stderr, "[WARN] %s: ", func); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n"); + va_end(args); +} + +void unified_log_info(const char * /* func */, const char * /* fmt */, ...) { + // Suppress info in tests +} + +void unified_log_debug(const char * /* func */, const char * /* fmt */, ...) { + // Suppress debug in tests +} + +void unified_log_always(const char *func, const char *fmt, ...) 
{ + va_list args; + va_start(args, fmt); + fprintf(stderr, "[ALWAYS] %s: ", func); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n"); + va_end(args); +} + +} // extern "C" + +// ============================================================================= +// device_time.h stub +// ============================================================================= + +uint64_t get_sys_cnt_aicpu() { + auto now = std::chrono::steady_clock::now(); + return static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(now.time_since_epoch()).count()); +} + +// ============================================================================= +// common.h stubs (assert_impl, get_stacktrace, AssertionError) +// ============================================================================= + +std::string get_stacktrace(int /* skip_frames */) { return ""; } + +class AssertionError : public std::runtime_error { +public: + AssertionError(const char *condition, const char *file, int line) : + std::runtime_error(std::string("Assertion failed: ") + condition + " at " + file + ":" + std::to_string(line)), + condition_(condition), + file_(file), + line_(line) {} + + const char *condition() const { return condition_; } + const char *file() const { return file_; } + int line() const { return line_; } + +private: + const char *condition_; + const char *file_; + int line_; +}; + +[[noreturn]] void assert_impl(const char *condition, const char *file, int line) { + throw AssertionError(condition, file, line); +} diff --git a/tests/ut/cpp/test_helpers.h b/tests/ut/cpp/test_helpers.h new file mode 100644 index 000000000..a4244c9a2 --- /dev/null +++ b/tests/ut/cpp/test_helpers.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Shared test helper utilities for C++ unit tests. + * + * Provides convenience functions that initialize internal data structures + * from user-supplied buffers, avoiding direct field manipulation in tests. + */ +#pragma once + +#include "pto_scheduler.h" + +/** + * Initialize a ReadyQueue with a caller-provided slot buffer and start sequence. + * + * Unlike pto2_ready_queue_init() which malloc's its own buffer and starts at 0, + * this helper uses a stack-allocated buffer and supports arbitrary start sequences + * (needed for sequence-wrap tests). 
+ */ +inline void +test_ready_queue_init(PTO2ReadyQueue *queue, PTO2ReadyQueueSlot *slots, uint64_t capacity, int64_t start_seq = 0) { + queue->slots = slots; + queue->capacity = capacity; + queue->mask = capacity - 1; + queue->enqueue_pos.store(start_seq, std::memory_order_relaxed); + queue->dequeue_pos.store(start_seq, std::memory_order_relaxed); + for (uint64_t i = 0; i < capacity; i++) { + int64_t pos = start_seq + (int64_t)i; + uint64_t idx = (uint64_t)pos & (capacity - 1); + slots[idx].sequence.store(pos, std::memory_order_relaxed); + slots[idx].slot_state = nullptr; + } +} diff --git a/tests/ut/cpp/test_child_memory.cpp b/tests/ut/cpp/types/test_child_memory.cpp similarity index 98% rename from tests/ut/cpp/test_child_memory.cpp rename to tests/ut/cpp/types/test_child_memory.cpp index 2ac7073a2..418cfdc7c 100644 --- a/tests/ut/cpp/test_child_memory.cpp +++ b/tests/ut/cpp/types/test_child_memory.cpp @@ -20,6 +20,7 @@ // ContinuousTensor layout // --------------------------------------------------------------------------- +// ABI contract: size must match serialization format. TEST(ChildMemory, SizeofUnchanged) { EXPECT_EQ(sizeof(ContinuousTensor), 40u); } TEST(ChildMemory, DefaultIsZero) { diff --git a/tests/ut/cpp/types/test_pto_types.cpp b/tests/ut/cpp/types/test_pto_types.cpp new file mode 100644 index 000000000..95c9dec25 --- /dev/null +++ b/tests/ut/cpp/types/test_pto_types.cpp @@ -0,0 +1,407 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for Arg and TaskOutputTensors from pto_types.h. + * + * Tests argument ordering enforcement, tensor/scalar storage, + * error propagation, add_scalars_i32 zero-extension, copy_scalars_from, + * and TaskOutputTensors materialization. + */ + +#include <cstdint> +#include <vector> + +#include <gtest/gtest.h> + +#include "common.h" +#include "pto_orchestration_api.h" +#include "pto_types.h" + +// ============================================================================= +// Helpers +// ============================================================================= + +static Tensor make_test_tensor(void *buf) { + uint32_t shapes[2] = {4, 8}; + return make_tensor_external(buf, shapes, 2, DataType::FLOAT32); +} + +// ============================================================================= +// TaskOutputTensors +// ============================================================================= + +TEST(TaskOutputTensors, InitialState) { + TaskOutputTensors out; + EXPECT_TRUE(out.empty()); + EXPECT_EQ(out.size(), 0u); +} + +TEST(TaskOutputTensors, MaterializeAddsOne) { + float buf[4] = {}; + Tensor t = make_test_tensor(buf); + + TaskOutputTensors out; + out.materialize_output(t); + + EXPECT_FALSE(out.empty()); + EXPECT_EQ(out.size(), 1u); +} + +TEST(TaskOutputTensors, GetRefReturnsCorrectTensor) { + float buf0[4] = {}; + float buf1[4] = {}; + Tensor t0 = make_test_tensor(buf0); + Tensor t1 = make_test_tensor(buf1); + + TaskOutputTensors out; + out.materialize_output(t0); + out.materialize_output(t1); + + EXPECT_EQ(&out.get_ref(0), &t0); + EXPECT_EQ(&out.get_ref(1), &t1); + EXPECT_EQ(out.size(), 2u); +} + +TEST(TaskOutputTensors, GetRefOutOfRangeThrows) { + TaskOutputTensors out; + EXPECT_THROW(out.get_ref(0), AssertionError); +} + +TEST(TaskOutputTensors, MaxOutputsFill) { + float 
bufs[PTO2_MAX_OUTPUTS] = {}; + std::vector<Tensor> tensors; + tensors.reserve(PTO2_MAX_OUTPUTS); + + TaskOutputTensors out; + for (int i = 0; i < PTO2_MAX_OUTPUTS; i++) { + tensors.push_back(make_test_tensor(&bufs[i])); + out.materialize_output(tensors.back()); + } + EXPECT_EQ(out.size(), static_cast<size_t>(PTO2_MAX_OUTPUTS)); +} + +// ============================================================================= +// Arg -- initial state +// ============================================================================= + +TEST(Arg, DefaultState) { + Arg a; + EXPECT_FALSE(a.has_error); + EXPECT_EQ(a.error_msg, nullptr); + EXPECT_EQ(a.tensor_count(), 0); + EXPECT_EQ(a.scalar_count(), 0); +} + +// ============================================================================= +// Arg -- add_input / add_output / add_inout +// ============================================================================= + +TEST(Arg, AddInput) { + float buf[4] = {}; + Tensor t = make_test_tensor(buf); + Arg a; + a.add_input(t); + EXPECT_EQ(a.tensor_count(), 1); + EXPECT_EQ(a.tag(0), TensorArgType::INPUT); + EXPECT_EQ(a.tensor(0).ptr, &t); + EXPECT_FALSE(a.has_error); +} + +TEST(Arg, AddOutput) { + uint32_t shapes[2] = {4, 8}; + TensorCreateInfo ci(shapes, 2, DataType::FLOAT32); + Arg a; + a.add_output(ci); + EXPECT_EQ(a.tensor_count(), 1); + EXPECT_EQ(a.tag(0), TensorArgType::OUTPUT); + EXPECT_EQ(a.tensor(0).create_info, &ci); + EXPECT_FALSE(a.has_error); +} + +TEST(Arg, AddInout) { + float buf[4] = {}; + Tensor t = make_test_tensor(buf); + Arg a; + a.add_inout(t); + EXPECT_EQ(a.tensor_count(), 1); + EXPECT_EQ(a.tag(0), TensorArgType::INOUT); + EXPECT_EQ(a.tensor(0).ptr, &t); + EXPECT_FALSE(a.has_error); +} + +TEST(Arg, MixedInputOutputInout) { + float buf_in[4] = {}, buf_inout[4] = {}; + Tensor tin = make_test_tensor(buf_in); + Tensor tinout = make_test_tensor(buf_inout); + uint32_t shapes_in[2] = {4, 8}; + TensorCreateInfo ci(shapes_in, 1, DataType::FLOAT32); + + Arg a; + a.add_input(tin); + 
a.add_output(ci); + a.add_inout(tinout); + + EXPECT_EQ(a.tensor_count(), 3); + EXPECT_EQ(a.tag(0), TensorArgType::INPUT); + EXPECT_EQ(a.tag(1), TensorArgType::OUTPUT); + EXPECT_EQ(a.tag(2), TensorArgType::INOUT); + EXPECT_FALSE(a.has_error); +} + +// ============================================================================= +// Arg -- ordering enforcement: tensor after scalar sets error +// ============================================================================= + +TEST(Arg, TensorAfterScalarSetsError) { + float buf[4] = {}; + Tensor t = make_test_tensor(buf); + Arg a; + a.add_scalar(uint64_t(42)); + a.add_input(t); // invalid: tensor after scalar + EXPECT_TRUE(a.has_error); + EXPECT_NE(a.error_msg, nullptr); + // The scalar was recorded, the tensor was not + EXPECT_EQ(a.tensor_count(), 0); + EXPECT_EQ(a.scalar_count(), 1); +} + +TEST(Arg, OutputAfterScalarSetsError) { + uint32_t shapes_in[2] = {4, 8}; + TensorCreateInfo ci(shapes_in, 1, DataType::FLOAT32); + Arg a; + a.add_scalar(uint64_t(1)); + a.add_output(ci); + EXPECT_TRUE(a.has_error); + EXPECT_EQ(a.tensor_count(), 0); +} + +TEST(Arg, InoutAfterScalarSetsError) { + float buf[4] = {}; + Tensor t = make_test_tensor(buf); + Arg a; + a.add_scalar(uint64_t(1)); + a.add_inout(t); + EXPECT_TRUE(a.has_error); + EXPECT_EQ(a.tensor_count(), 0); +} + +// ============================================================================= +// Arg -- capacity limits +// ============================================================================= + +TEST(Arg, TensorCapacityExceeded) { + Arg a; + for (int i = 0; i < MAX_TENSOR_ARGS; i++) { + float dummy = 0.0f; + Tensor t = make_test_tensor(&dummy); + a.add_input(t); + ASSERT_FALSE(a.has_error) << "Failed at tensor " << i; + } + // One more should trigger the error + float extra = 0.0f; + Tensor t_extra = make_test_tensor(&extra); + a.add_input(t_extra); + EXPECT_TRUE(a.has_error); + EXPECT_EQ(a.tensor_count(), MAX_TENSOR_ARGS); +} + +TEST(Arg, ScalarCapacityExceeded) { + 
Arg a; + for (int i = 0; i < MAX_SCALAR_ARGS; i++) { + a.add_scalar(static_cast<uint64_t>(i)); + ASSERT_FALSE(a.has_error) << "Failed at scalar " << i; + } + a.add_scalar(uint64_t(999)); + EXPECT_TRUE(a.has_error); + EXPECT_EQ(a.scalar_count(), MAX_SCALAR_ARGS); +} + +// ============================================================================= +// Arg -- add_scalar with various types +// ============================================================================= + +TEST(Arg, AddScalarUint64) { + Arg a; + a.add_scalar(uint64_t(0xDEADBEEFCAFEBABEULL)); + EXPECT_EQ(a.scalar_count(), 1); + EXPECT_EQ(a.scalar(0), 0xDEADBEEFCAFEBABEULL); + EXPECT_FALSE(a.has_error); +} + +TEST(Arg, AddScalarFloat) { + Arg a; + float v = 3.14f; + a.add_scalar(v); + EXPECT_EQ(a.scalar_count(), 1); + EXPECT_EQ(a.scalar(0), to_u64(v)); + EXPECT_FALSE(a.has_error); +} + +TEST(Arg, AddScalarInt32) { + Arg a; + int32_t v = -7; + a.add_scalar(v); + EXPECT_EQ(a.scalar_count(), 1); + EXPECT_EQ(a.scalar(0), to_u64(v)); + EXPECT_FALSE(a.has_error); +} + +// ============================================================================= +// Arg -- add_scalars (batch uint64) +// ============================================================================= + +TEST(Arg, AddScalarsBatch) { + Arg a; + uint64_t vals[3] = {10, 20, 30}; + a.add_scalars(vals, 3); + EXPECT_EQ(a.scalar_count(), 3); + EXPECT_EQ(a.scalar(0), 10u); + EXPECT_EQ(a.scalar(1), 20u); + EXPECT_EQ(a.scalar(2), 30u); + EXPECT_FALSE(a.has_error); +} + +TEST(Arg, AddScalarsBatchOverCapacitySetsError) { + Arg a; + // Fill to capacity minus 1 + for (int i = 0; i < MAX_SCALAR_ARGS - 1; i++) { + a.add_scalar(uint64_t(i)); + } + // Batch of 3 would overflow by 2 + uint64_t vals[3] = {1, 2, 3}; + a.add_scalars(vals, 3); + EXPECT_TRUE(a.has_error); +} + +// ============================================================================= +// Arg -- add_scalars_i32 (zero-extension) +// 
============================================================================= + +TEST(Arg, AddScalarsI32ZeroExtends) { + Arg a; + int32_t vals[4] = {0, 1, -1, 0x7FFFFFFF}; + a.add_scalars_i32(vals, 4); + EXPECT_EQ(a.scalar_count(), 4); + EXPECT_EQ(a.scalar(0), uint64_t(0)); + EXPECT_EQ(a.scalar(1), uint64_t(1)); + // -1 as int32 is 0xFFFFFFFF; zero-extended to uint64 is 0x00000000FFFFFFFF + EXPECT_EQ(a.scalar(2), uint64_t(0x00000000FFFFFFFFull)); + EXPECT_EQ(a.scalar(3), uint64_t(0x000000007FFFFFFFull)); + EXPECT_FALSE(a.has_error); +} + +TEST(Arg, AddScalarsI32NegativeValues) { + Arg a; + int32_t vals[2] = {-1, -2}; + a.add_scalars_i32(vals, 2); + // -1 -> 0xFFFFFFFF zero-extended -> 0x00000000FFFFFFFF + // -2 -> 0xFFFFFFFE zero-extended -> 0x00000000FFFFFFFE + EXPECT_EQ(a.scalar(0), uint64_t(0xFFFFFFFFull)); + EXPECT_EQ(a.scalar(1), uint64_t(0xFFFFFFFEull)); +} + +TEST(Arg, AddScalarsI32SingleElement) { + Arg a; + int32_t v = 42; + a.add_scalars_i32(&v, 1); + EXPECT_EQ(a.scalar_count(), 1); + EXPECT_EQ(a.scalar(0), uint64_t(42)); +} + +TEST(Arg, AddScalarsI32OverCapacitySetsError) { + Arg a; + for (int i = 0; i < MAX_SCALAR_ARGS - 1; i++) { + a.add_scalar(uint64_t(i)); + } + int32_t vals[3] = {1, 2, 3}; + a.add_scalars_i32(vals, 3); + EXPECT_TRUE(a.has_error); +} + +// ============================================================================= +// Arg -- copy_scalars_from +// ============================================================================= + +TEST(Arg, CopyScalarsFrom) { + Arg src; + src.add_scalar(uint64_t(10)); + src.add_scalar(uint64_t(20)); + src.add_scalar(uint64_t(30)); + + Arg dst; + dst.copy_scalars_from(src, 1, 2); // copy scalars[1..2] = {20, 30} + EXPECT_EQ(dst.scalar_count(), 2); + EXPECT_EQ(dst.scalar(0), uint64_t(20)); + EXPECT_EQ(dst.scalar(1), uint64_t(30)); + EXPECT_FALSE(dst.has_error); +} + +TEST(Arg, CopyScalarsFromOutOfBoundsSetsError) { + Arg src; + src.add_scalar(uint64_t(1)); + + Arg dst; + dst.copy_scalars_from(src, 0, 5); 
// only 1 scalar available, request 5 + EXPECT_TRUE(dst.has_error); +} + +TEST(Arg, CopyScalarsFromFull) { + Arg src; + for (int i = 0; i < MAX_SCALAR_ARGS; i++) { + src.add_scalar(static_cast<uint64_t>(i)); + } + Arg dst; + for (int i = 0; i < MAX_SCALAR_ARGS - 1; i++) { + dst.add_scalar(uint64_t(0)); + } + // dst has MAX-1 scalars; copying 2 from src would overflow + dst.copy_scalars_from(src, 0, 2); + EXPECT_TRUE(dst.has_error); +} + +// ============================================================================= +// Arg -- reset clears all state +// ============================================================================= + +TEST(Arg, ResetClearsAll) { + float buf[4] = {}; + Tensor t = make_test_tensor(buf); + Arg a; + a.add_input(t); + a.add_scalar(uint64_t(99)); + a.set_error("deliberate error"); + + a.reset(); + EXPECT_EQ(a.tensor_count(), 0); + EXPECT_EQ(a.scalar_count(), 0); + EXPECT_FALSE(a.has_error); + EXPECT_EQ(a.error_msg, nullptr); +} + +// ============================================================================= +// Arg -- set_error is idempotent (first error wins) +// ============================================================================= + +TEST(Arg, SetErrorFirstWins) { + Arg a; + a.set_error("first"); + a.set_error("second"); + EXPECT_STREQ(a.error_msg, "first"); +} + +// ============================================================================= +// Arg -- launch_spec default +// ============================================================================= + +TEST(Arg, LaunchSpecDefaultBlockNum) { + Arg a; + EXPECT_EQ(a.launch_spec.block_num(), 1); +} diff --git a/tests/ut/cpp/types/test_tensor.cpp b/tests/ut/cpp/types/test_tensor.cpp new file mode 100644 index 000000000..f73f9de78 --- /dev/null +++ b/tests/ut/cpp/types/test_tensor.cpp @@ -0,0 +1,440 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for Tensor and related types in tensor.h + * + * Tests Tensor operations, TensorCreateInfo, Segment intersection, + * and boundary conditions in cache-line layout coupling. + */ + +#include <cstring> + +#include <gtest/gtest.h> + +#include "pto_orchestration_api.h" + +// Helper: create a Tensor via make_tensor_external (the public factory) +static Tensor make_test_tensor( + void *addr, const uint32_t shapes[], uint32_t ndims, DataType dtype = DataType::FLOAT32, bool manual_dep = false, + int32_t version = 0 +) { + return make_tensor_external(addr, shapes, ndims, dtype, manual_dep, version); +} + +// ============================================================================= +// Segment intersection +// ============================================================================= + +TEST(Segment, OverlappingIntersects) { + Segment a{0, 10}; + Segment b{5, 15}; + EXPECT_TRUE(a.line_segment_intersection(b)); + EXPECT_TRUE(b.line_segment_intersection(a)); +} + +TEST(Segment, TouchingDoesNotIntersect) { + Segment a{0, 10}; + Segment b{10, 20}; + EXPECT_FALSE(a.line_segment_intersection(b)); + EXPECT_FALSE(b.line_segment_intersection(a)); +} + +TEST(Segment, DisjointDoesNotIntersect) { + Segment a{0, 5}; + Segment b{10, 20}; + EXPECT_FALSE(a.line_segment_intersection(b)); + 
EXPECT_FALSE(b.line_segment_intersection(a)); +} + +TEST(Segment, ZeroLengthAtBoundary) { + // Zero-length segment at position 10 touching [0,10) + Segment a{10, 10}; + Segment b{0, 10}; + EXPECT_FALSE(a.line_segment_intersection(b)); +} + +TEST(Segment, ZeroLengthInsideRange) { + // Zero-length segment at position 5 inside [0,10) + // end(5) > other.begin(0) && other.end(10) > begin(5) => true + // KNOWN BEHAVIOR: zero-length segments report intersection. + // This could cause spurious dependencies in TensorMap overlap detection. + Segment a{5, 5}; + Segment b{0, 10}; + EXPECT_TRUE(a.line_segment_intersection(b)); +} + +TEST(Segment, IdenticalRanges) { + Segment a{0, 10}; + EXPECT_TRUE(a.line_segment_intersection(a)); +} + +TEST(Segment, ContainsFull) { + Segment outer{0, 20}; + Segment inner{5, 10}; + EXPECT_TRUE(outer.contains(inner)); +} + +TEST(Segment, ContainsIdentical) { + Segment a{0, 10}; + EXPECT_TRUE(a.contains(a)); +} + +TEST(Segment, DoesNotContainPartial) { + Segment a{0, 10}; + Segment b{5, 15}; + EXPECT_FALSE(a.contains(b)); +} + +TEST(Segment, ContainsAtBoundary) { + Segment a{0, 10}; + Segment b{0, 10}; + EXPECT_TRUE(a.contains(b)); +} + +// ============================================================================= +// TensorCreateInfo +// ============================================================================= + +TEST(TensorCreateInfo, BufferSizeBytes) { + uint32_t shapes[] = {4, 8}; + TensorCreateInfo ci(shapes, 2, DataType::FLOAT32); + EXPECT_EQ(ci.buffer_size_bytes(), 4u * 8u * 4u); // 4*8 elements * 4 bytes +} + +TEST(TensorCreateInfo, BufferSizeBytesInt8) { + uint32_t shapes[] = {10, 20, 30}; + TensorCreateInfo ci(shapes, 3, DataType::INT8); + EXPECT_EQ(ci.buffer_size_bytes(), 10u * 20u * 30u * 1u); +} + +// ABI contract: size must match device DMA requirements. 
+TEST(TensorCreateInfo, SizeIs64Bytes) { EXPECT_EQ(sizeof(TensorCreateInfo), 64u); } + +TEST(TensorCreateInfo, InitialValueDefault) { + uint32_t shapes[] = {4}; + TensorCreateInfo ci(shapes, 1); + EXPECT_FALSE(ci.has_initial_value); +} + +TEST(TensorCreateInfo, SetInitialValue) { + uint32_t shapes[] = {4}; + TensorCreateInfo ci(shapes, 1); + ci.set_initial_value(3.14f); + EXPECT_TRUE(ci.has_initial_value); +} + +// ============================================================================= +// Tensor basic operations +// ============================================================================= + +// ABI contract: size must match device DMA requirements. +TEST(Tensor, SizeIs128Bytes) { EXPECT_EQ(sizeof(Tensor), 128u); } + +// ABI contract: offset must match device DMA requirements. +TEST(Tensor, RawShapesAtOffset64) { EXPECT_EQ(offsetof(Tensor, raw_shapes), 64u); } + +TEST(Tensor, MakeExternal) { + char buf[256]; + uint32_t shapes[] = {4, 8}; + auto t = make_test_tensor(buf, shapes, 2); + EXPECT_EQ(t.buffer.addr, reinterpret_cast<uint64_t>(buf)); + EXPECT_EQ(t.ndims, 2u); + EXPECT_EQ(t.shapes[0], 4u); + EXPECT_EQ(t.shapes[1], 8u); +} + +TEST(Tensor, Numel) { + char buf[256]; + uint32_t shapes[] = {4, 8, 2}; + auto t = make_test_tensor(buf, shapes, 3); + EXPECT_EQ(t.numel(), 64u); +} + +TEST(Tensor, NumelZeroDim) { + char buf[256]; + uint32_t shapes[] = {}; + auto t = make_test_tensor(buf, shapes, 0); + EXPECT_EQ(t.numel(), 0u); +} + +TEST(Tensor, IsContiguousWhenRawEqShapes) { + char buf[256]; + uint32_t shapes[] = {4, 8}; + auto t = make_test_tensor(buf, shapes, 2); + EXPECT_TRUE(t.is_raw_eq_shapes); + EXPECT_TRUE(t.is_contiguous()); +} + +TEST(Tensor, IsSameMemref) { + char buf1[256], buf2[256]; + uint32_t shapes[] = {4}; + auto t1 = make_test_tensor(buf1, shapes, 1); + auto t2 = make_test_tensor(buf1, shapes, 1); + auto t3 = make_test_tensor(buf2, shapes, 1); + EXPECT_TRUE(t1.is_same_memref(t2)); + EXPECT_FALSE(t1.is_same_memref(t3)); +} + +// 
============================================================================= +// View +// ============================================================================= + +TEST(Tensor, ViewWithZeroOffsets) { + char buf[256]; + uint32_t shapes[] = {10, 20}; + auto parent = make_test_tensor(buf, shapes, 2); + + uint32_t view_shapes[] = {5, 10}; + uint32_t view_offsets[] = {0, 0}; + auto v = parent.view(view_shapes, view_offsets); + + EXPECT_EQ(v.shapes[0], 5u); + EXPECT_EQ(v.shapes[1], 10u); + EXPECT_TRUE(v.is_all_offset_zero); + EXPECT_EQ(v.buffer.addr, parent.buffer.addr); +} + +TEST(Tensor, ViewWithNonZeroOffsets) { + char buf[256]; + uint32_t shapes[] = {10, 20}; + auto parent = make_test_tensor(buf, shapes, 2); + + uint32_t view_shapes[] = {5, 10}; + uint32_t view_offsets[] = {2, 3}; + auto v = parent.view(view_shapes, view_offsets); + + EXPECT_EQ(v.shapes[0], 5u); + EXPECT_EQ(v.shapes[1], 10u); + EXPECT_FALSE(v.is_all_offset_zero); + EXPECT_EQ(v.offsets[0], 2u); + EXPECT_EQ(v.offsets[1], 3u); +} + +TEST(Tensor, ViewOffsetAccumulation) { + char buf[256]; + uint32_t shapes[] = {20, 30}; + auto parent = make_test_tensor(buf, shapes, 2); + + // First view with offsets + uint32_t v1_shapes[] = {10, 15}; + uint32_t v1_offsets[] = {5, 10}; + auto v1 = parent.view(v1_shapes, v1_offsets); + + // Second view on top of first + uint32_t v2_shapes[] = {3, 4}; + uint32_t v2_offsets[] = {1, 2}; + auto v2 = v1.view(v2_shapes, v2_offsets); + + EXPECT_EQ(v2.offsets[0], 6u); // 5 + 1 + EXPECT_EQ(v2.offsets[1], 12u); // 10 + 2 +} + +// ============================================================================= +// Reshape +// ============================================================================= + +TEST(Tensor, ReshapeContiguous) { + char buf[256]; + uint32_t shapes[] = {4, 8}; + auto t = make_test_tensor(buf, shapes, 2); + + uint32_t new_shapes[] = {32}; + auto r = t.reshape(new_shapes, 1); + + EXPECT_EQ(r.numel(), 32u); + EXPECT_EQ(r.ndims, 1u); + EXPECT_EQ(r.shapes[0], 
32u); + EXPECT_TRUE(r.is_raw_eq_shapes); + EXPECT_TRUE(r.is_all_offset_zero); +} + +TEST(Tensor, ReshapePreservesBuffer) { + char buf[256]; + uint32_t shapes[] = {4, 8}; + auto t = make_test_tensor(buf, shapes, 2); + + uint32_t new_shapes[] = {2, 16}; + auto r = t.reshape(new_shapes, 2); + + EXPECT_EQ(r.buffer.addr, t.buffer.addr); +} + +// ============================================================================= +// Transpose +// ============================================================================= + +TEST(Tensor, TransposeSwapsDims) { + char buf[256]; + uint32_t shapes[] = {4, 8, 2}; + auto t = make_test_tensor(buf, shapes, 3); + + auto tr = t.transpose(0, 2); + + EXPECT_EQ(tr.shapes[0], 2u); + EXPECT_EQ(tr.shapes[1], 8u); + EXPECT_EQ(tr.shapes[2], 4u); + EXPECT_EQ(tr.numel(), t.numel()); +} + +// ============================================================================= +// compute_flat_offset +// ============================================================================= + +TEST(Tensor, ComputeFlatOffsetZeroDim) { + char buf[256]; + uint32_t shapes[] = {}; + auto t = make_test_tensor(buf, shapes, 0); + uint32_t indices[] = {}; + EXPECT_EQ(t.compute_flat_offset(indices, 0), 0u); +} + +TEST(Tensor, ComputeFlatOffset1D) { + char buf[256]; + uint32_t shapes[] = {10}; + auto t = make_test_tensor(buf, shapes, 1); + uint32_t indices[] = {7}; + EXPECT_EQ(t.compute_flat_offset(indices, 1), 7u); +} + +TEST(Tensor, ComputeFlatOffset2D) { + char buf[256]; + uint32_t shapes[] = {4, 8}; + auto t = make_test_tensor(buf, shapes, 2); + // Row-major: offset = i0 * 8 + i1 = 2*8+3 = 19 + uint32_t indices[] = {2, 3}; + EXPECT_EQ(t.compute_flat_offset(indices, 2), 19u); +} + +// ============================================================================= +// update_start_offset +// ============================================================================= + +TEST(Tensor, UpdateStartOffsetZeroOffsets) { + char buf[256]; + uint32_t shapes[] = {4, 8}; + auto t = 
make_test_tensor(buf, shapes, 2); + t.update_start_offset(); + EXPECT_EQ(t.start_offset, 0u); +} + +// ============================================================================= +// fill_initial_value +// ============================================================================= + +TEST(Tensor, FillInitialValue) { + alignas(64) char buf[128]; + memset(buf, 0, sizeof(buf)); + + uint32_t shapes[] = {32}; + TensorCreateInfo ci(shapes, 1, DataType::FLOAT32); + ci.set_initial_value(1.0f); + + // Use make_tensor_external then overwrite with init_from_create_info + auto t = make_tensor_external(buf, shapes, 1); + t.init_from_create_info(ci, buf, sizeof(buf)); + + // Check that the buffer was filled with 1.0f + float *data = reinterpret_cast<float *>(buf); + for (int i = 0; i < 32; i++) { + EXPECT_FLOAT_EQ(data[i], 1.0f) << "Mismatch at index " << i; + } +} + +// ============================================================================= +// Layout coupling: TensorCreateInfo <-> Tensor cacheline 1 +// ============================================================================= + +// ABI contract: TensorCreateInfo layout must match Tensor cacheline 1 for DMA. +TEST(LayoutCoupling, TensorCreateInfoMatchesTensor) { + // These static_asserts are in tensor.h but we verify they compile here + static_assert(offsetof(TensorCreateInfo, version) == offsetof(Tensor, version)); + static_assert(offsetof(TensorCreateInfo, dtype) == offsetof(Tensor, dtype)); + static_assert(offsetof(TensorCreateInfo, ndims) == offsetof(Tensor, ndims)); + static_assert(offsetof(TensorCreateInfo, is_all_offset_zero) == offsetof(Tensor, is_all_offset_zero)); + SUCCEED(); +} + +// ============================================================================= +// Tensor overlap tests +// +// Helper: build a 1D tensor through make_tensor_external (Tensor's ctor is +// private), then override buffer.size / offsets[0] / version to script the +// scenarios these tests exercise. 
+// ============================================================================= + +static Tensor +make_1d_tensor(uint64_t addr, uint64_t buf_size, uint32_t shape, uint32_t offset = 0, int32_t version = 0) { + uint32_t shapes[] = {shape, 0, 0, 0, 0}; + Tensor t = make_tensor_external( + reinterpret_cast<void *>(addr), shapes, 1, DataType::FLOAT32, /*manual_dep=*/false, version + ); + t.buffer.size = buf_size; + // make_tensor_external leaves offsets[] uninitialized when is_all_offset_zero, + // so set it here regardless of zero/non-zero value. + t.offsets[0] = offset; + t.is_all_offset_zero = (offset == 0); + return t; +} + +TEST(TensorOverlap, NoOverlap_DifferentAddr) { + Tensor a = make_1d_tensor(0x100, 400, 100); + Tensor b = make_1d_tensor(0x200, 400, 100); + EXPECT_NE(a.buffer.addr, b.buffer.addr); +} + +TEST(TensorOverlap, FullOverlap_Identical) { + Tensor a = make_1d_tensor(0x100, 400, 100, 0, 0); + Tensor b = make_1d_tensor(0x100, 400, 100, 0, 0); + EXPECT_EQ(a.buffer.addr, b.buffer.addr); + EXPECT_EQ(a.shapes[0], b.shapes[0]); + EXPECT_EQ(a.offsets[0], b.offsets[0]); +} + +TEST(TensorOverlap, PartialOverlap_1D) { + // [0:100] vs [50:150] -- partial overlap + Tensor a = make_1d_tensor(0x100, 600, 100, 0, 0); + Tensor b = make_1d_tensor(0x100, 600, 100, 50, 0); + EXPECT_EQ(a.buffer.addr, b.buffer.addr); + EXPECT_NE(a.offsets[0], b.offsets[0]); +} + +TEST(TensorOverlap, Contained_Subset) { + // [10:20] is within [0:100] + Tensor big = make_1d_tensor(0x100, 400, 100, 0, 0); + Tensor small = make_1d_tensor(0x100, 400, 10, 10, 0); + EXPECT_EQ(big.buffer.addr, small.buffer.addr); + Segment big_seg{0, 100}; + Segment small_seg{10, 20}; + EXPECT_TRUE(big_seg.contains(small_seg)); +} + +TEST(TensorOverlap, NoOverlap_Adjacent) { + // [0:100] vs [100:200] -- adjacent, no overlap + Segment a{0, 100}; + Segment b{100, 200}; + EXPECT_FALSE(a.line_segment_intersection(b)); +} + +TEST(TensorOverlap, TensorInitFields) { + uint32_t shapes[] = {10, 20, 0, 0, 0}; + Tensor t = 
make_tensor_external( + reinterpret_cast<void *>(0x1000), shapes, 2, DataType::FLOAT32, /*manual_dep=*/false, + /*version=*/5 + ); + EXPECT_EQ(t.buffer.addr, 0x1000u); + EXPECT_EQ(t.buffer.size, 800u); + EXPECT_EQ(t.ndims, 2u); + EXPECT_EQ(t.version, 5); + EXPECT_EQ(t.shapes[0], 10u); + EXPECT_EQ(t.shapes[1], 20u); + EXPECT_TRUE(t.is_all_offset_zero); + EXPECT_TRUE(t.is_raw_eq_shapes); +} diff --git a/tests/ut/py/conftest.py b/tests/ut/py/conftest.py index e3936c3c0..7de3b3bb3 100644 --- a/tests/ut/py/conftest.py +++ b/tests/ut/py/conftest.py @@ -8,15 +8,33 @@ # ----------------------------------------------------------------------------------------------------------- """Pytest configuration for Python unit tests (tests/ut/py/). -Adds project directories to sys.path so that simpler_setup, task_interface, -and host_worker modules are importable without installing the package. +Adds project directories to sys.path so that: +- ``import simpler_setup`` works (PROJECT_ROOT on path) +- ``from simpler import env_manager`` works (python/ on path) +- legacy ``import env_manager`` works (python/simpler/ on path) """ import sys from pathlib import Path -_ROOT = Path(__file__).resolve().parent.parent.parent.parent -for _d in [_ROOT, _ROOT / "python"]: +import pytest + +_ROOT = Path(__file__).parent.parent.parent.parent + +# Order matters: PROJECT_ROOT first (so ``import simpler_setup`` works as a +# package), then python/ so ``from simpler import env_manager`` resolves, then +# python/simpler/ so legacy ``import env_manager`` works. 
+for _d in [ + _ROOT, + _ROOT / "python", + _ROOT / "python" / "simpler", +]: _s = str(_d) if _s not in sys.path: sys.path.insert(0, _s) + + +@pytest.fixture +def project_root(): + """Return the project root directory.""" + return _ROOT diff --git a/tests/ut/py/test_elf_parser.py b/tests/ut/py/test_elf_parser.py new file mode 100644 index 000000000..a2fffe53d --- /dev/null +++ b/tests/ut/py/test_elf_parser.py @@ -0,0 +1,209 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +"""Tests for python/elf_parser.py - ELF64 and Mach-O .text extraction.""" + +import struct +import tempfile + +import pytest + +from simpler_setup.elf_parser import _extract_cstring, extract_text_section + + +def _build_elf64_with_text(text_data: bytes) -> bytes: + """Build a minimal ELF64 .o file with a .text section.""" + # String table: \0.text\0.shstrtab\0 + strtab = b"\x00.text\x00.shstrtab\x00" + text_name_offset = 1 # offset of ".text" in strtab + shstrtab_name_offset = 7 # offset of ".shstrtab" in strtab + + # ELF header (64 bytes) + e_shoff = 64 # section headers right after ELF header + e_shnum = 3 # null + .text + .shstrtab + e_shstrndx = 2 # .shstrtab is section 2 + + elf_header = bytearray(64) + elf_header[0:4] = b"\x7fELF" + elf_header[4] = 2 # 64-bit + elf_header[5] = 1 # little-endian + elf_header[6] = 1 # version + struct.pack_into(" bytes: + """Build a minimal Mach-O 64-bit .o file with __text section.""" + # Header (32 bytes) + header = bytearray(32) + struct.pack_into("= 1 + assert any("a2a3" in d and "platform" in d and "include" in d for d in dirs) + + def test_a5sim_include_dirs(self): + """a5sim platform include dirs point to a5/platform/include.""" + env_manager._cache["ASCEND_HOME_PATH"] = None + from simpler_setup.kernel_compiler import KernelCompiler # noqa: PLC0415 + + kc = KernelCompiler(platform="a5sim") + dirs = kc.get_platform_include_dirs() + assert any("a5" in d and "platform" in d and "include" in d for d in dirs) + + +# ============================================================================= +# Orchestration include directory tests +# ============================================================================= + + +class TestOrchestrationIncludeDirs: + """Tests for get_orchestration_include_dirs().""" + + def test_a2a3_includes_runtime_dir(self, sim_compiler): + """Orchestration includes contain the 
runtime-specific directory.""" + dirs = sim_compiler.get_orchestration_include_dirs("host_build_graph") + assert any("host_build_graph" in d and "runtime" in d for d in dirs) + + def test_a5_includes_runtime_dir(self): + """A5 orchestration includes point to a5 runtime directory.""" + env_manager._cache["ASCEND_HOME_PATH"] = None + from simpler_setup.kernel_compiler import KernelCompiler # noqa: PLC0415 + + kc = KernelCompiler(platform="a5sim") + dirs = kc.get_orchestration_include_dirs("host_build_graph") + assert any("a5" in d and "host_build_graph" in d for d in dirs) + + +# ============================================================================= +# Platform to architecture mapping tests +# ============================================================================= + + +class TestPlatformToArchMapping: + """Tests for platform -> architecture directory mapping.""" + + def test_a2a3_maps_to_a2a3(self, sim_compiler): + """a2a3sim maps to a2a3 architecture directory.""" + assert "a2a3" in str(sim_compiler.platform_dir) + + def test_a5sim_maps_to_a5(self): + """a5sim maps to a5 architecture directory.""" + env_manager._cache["ASCEND_HOME_PATH"] = None + from simpler_setup.kernel_compiler import KernelCompiler # noqa: PLC0415 + + kc = KernelCompiler(platform="a5sim") + assert "a5" in str(kc.platform_dir) + + def test_unknown_platform_raises(self): + """Unknown platform raises ValueError.""" + env_manager._cache["ASCEND_HOME_PATH"] = None + from simpler_setup.kernel_compiler import KernelCompiler # noqa: PLC0415 + + with pytest.raises(ValueError, match="Unknown platform"): + KernelCompiler(platform="z9000") + + +# ============================================================================= +# Toolchain selection tests (via compile_incore public API) +# ============================================================================= + + +class TestToolchainSelection: + """Tests for toolchain selection behavior via public API.""" + + def 
test_unknown_platform_compile_raises(self): + """Unknown platform raises ValueError at construction time.""" + env_manager._cache["ASCEND_HOME_PATH"] = None + from simpler_setup.kernel_compiler import KernelCompiler # noqa: PLC0415 + + with pytest.raises(ValueError, match="Unknown platform"): + KernelCompiler(platform="z9000_nonexistent") + + +# ============================================================================= +# Compilation error handling tests (via public compile methods) +# ============================================================================= + + +class TestCompilationErrors: + """Tests for compilation error handling via public API.""" + + def test_compile_incore_missing_source_raises(self, sim_compiler, tmp_path): + """Compiling a non-existent source file raises an error.""" + bad_source = str(tmp_path / "nonexistent_kernel.cpp") + with pytest.raises((RuntimeError, FileNotFoundError, OSError)): + sim_compiler.compile_incore(bad_source, core_type="aiv") + + def test_compile_orchestration_subprocess_failure(self, sim_compiler, tmp_path): + """Compilation failure propagates error with stderr content.""" + source = tmp_path / "dummy.cpp" + source.write_text("int main() {}") + with patch("simpler_setup.kernel_compiler.subprocess.run") as mock_run: + mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="error: undefined reference to 'foo'") + with pytest.raises(RuntimeError, match="undefined reference"): + sim_compiler.compile_orchestration( + "host_build_graph", + str(source), + ) + + +# ============================================================================= +# Orchestration config loading tests (via get_orchestration_include_dirs) +# ============================================================================= + + +class TestOrchestrationConfig: + """Tests for orchestration config behavior via public API.""" + + def test_nonexistent_runtime_include_dirs(self, sim_compiler): + """Non-existent runtime still returns base include 
dirs (no crash).""" + dirs = sim_compiler.get_orchestration_include_dirs("nonexistent_runtime") + # Should return at least the platform includes, not crash + assert isinstance(dirs, list) diff --git a/tests/ut/py/test_runtime_builder.py b/tests/ut/py/test_runtime_builder.py index 6d5951dcd..97458d388 100644 --- a/tests/ut/py/test_runtime_builder.py +++ b/tests/ut/py/test_runtime_builder.py @@ -287,7 +287,7 @@ def _reset_compiler_singleton(self): from simpler_setup.runtime_compiler import RuntimeCompiler # noqa: PLC0415 yield - RuntimeCompiler._instances.clear() + RuntimeCompiler.reset_instances() def test_get_binaries_returns_valid_paths(self, platform, runtime_name): """get_binaries(build=True) produces RuntimeBinaries with existing files.""" diff --git a/tests/ut/py/test_runtime_compiler.py b/tests/ut/py/test_runtime_compiler.py new file mode 100644 index 000000000..673da3510 --- /dev/null +++ b/tests/ut/py/test_runtime_compiler.py @@ -0,0 +1,151 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +"""Unit tests for python/runtime_compiler.py -- CMake-based runtime compilation.""" + +import os +from unittest.mock import MagicMock, patch + +import pytest +from simpler import env_manager + +# ============================================================================= +# Fixtures +# ============================================================================= + + +@pytest.fixture(autouse=True) +def _clear_env_manager_cache(): + """Clear env_manager cache before each test.""" + env_manager._cache.clear() + yield + env_manager._cache.clear() + + +@pytest.fixture(autouse=True) +def _reset_compiler_singleton(): + """Reset RuntimeCompiler singleton cache between tests.""" + from simpler_setup.runtime_compiler import RuntimeCompiler # noqa: PLC0415 + + yield + RuntimeCompiler.reset_instances() + + +# ============================================================================= +# BuildTarget tests +# ============================================================================= + + +class TestBuildTarget: + """Tests for BuildTarget CMake argument generation.""" + + def test_cmake_args_assembly(self, tmp_path): + """gen_cmake_args() combines toolchain args with include/source dirs.""" + env_manager._cache["ASCEND_HOME_PATH"] = None + from simpler_setup.runtime_compiler import BuildTarget # noqa: PLC0415 + + mock_toolchain = MagicMock() + mock_toolchain.get_cmake_args.return_value = ["-DCMAKE_CXX_COMPILER=g++"] + + target = BuildTarget(mock_toolchain, str(tmp_path), "libtest.so") + args = target.gen_cmake_args(include_dirs=[str(tmp_path / "inc")], source_dirs=[str(tmp_path / "src")]) + + assert "-DCMAKE_CXX_COMPILER=g++" in args + assert any("CUSTOM_INCLUDE_DIRS" in a for a in args) + assert any("CUSTOM_SOURCE_DIRS" in a for a in args) + + def test_root_dir_is_absolute(self, tmp_path): + """get_root_dir() returns an absolute path.""" + 
env_manager._cache["ASCEND_HOME_PATH"] = None + from simpler_setup.runtime_compiler import BuildTarget # noqa: PLC0415 + + mock_toolchain = MagicMock() + target = BuildTarget(mock_toolchain, str(tmp_path / "src"), "lib.so") + assert os.path.isabs(target.get_root_dir()) + + def test_binary_name(self, tmp_path): + """get_binary_name() returns the configured name.""" + env_manager._cache["ASCEND_HOME_PATH"] = None + from simpler_setup.runtime_compiler import BuildTarget # noqa: PLC0415 + + mock_toolchain = MagicMock() + target = BuildTarget(mock_toolchain, str(tmp_path), "mylib.so") + assert target.get_binary_name() == "mylib.so" + + +# ============================================================================= +# RuntimeCompiler tests +# ============================================================================= + + +class TestRuntimeCompiler: + """Tests for RuntimeCompiler initialization and validation.""" + + @patch("simpler_setup.runtime_compiler.RuntimeCompiler._ensure_host_compilers") + def test_unknown_platform_raises(self, mock_ensure): + """Unknown platform raises ValueError with supported list.""" + from simpler_setup.runtime_compiler import RuntimeCompiler # noqa: PLC0415 + + with pytest.raises(ValueError, match="Unknown platform.*Supported"): + RuntimeCompiler("z9000") + + @patch("simpler_setup.runtime_compiler.RuntimeCompiler._ensure_host_compilers") + def test_missing_platform_dir_raises(self, mock_ensure, tmp_path): + """Non-existent platform directory raises ValueError.""" + # a2a3sim expects src/a2a3/platform/sim/ to exist + # With a custom project_root that doesn't have the dir, it should fail + # Verify that a non-existent platform dir would not exist + phantom_dir = tmp_path / "src" / "a2a3" / "platform" / "sim" + assert not phantom_dir.is_dir() + + def test_singleton_pattern(self): + """get_instance() returns same instance for same platform.""" + env_manager._cache["ASCEND_HOME_PATH"] = None + from simpler_setup.runtime_compiler import 
RuntimeCompiler # noqa: PLC0415
+
+        with patch.object(RuntimeCompiler, "_ensure_host_compilers"):
+            rc1 = RuntimeCompiler.get_instance("a2a3sim")
+            rc2 = RuntimeCompiler.get_instance("a2a3sim")
+            assert rc1 is rc2
+
+
+# =============================================================================
+# Compiler availability tests (via construction behavior)
+# =============================================================================
+
+
+class TestCompilerAvailability:
+    """Tests for compiler availability via construction."""
+
+    def test_sim_platform_construction_succeeds(self):
+        """Sim platform can be constructed (no hardware compilers needed)."""
+        env_manager._cache["ASCEND_HOME_PATH"] = None
+        from simpler_setup.runtime_compiler import RuntimeCompiler  # noqa: PLC0415
+
+        with patch.object(RuntimeCompiler, "_ensure_host_compilers"):
+            rc = RuntimeCompiler("a2a3sim")
+            assert rc.platform == "a2a3sim"
+
+
+# =============================================================================
+# Compile target validation tests
+# =============================================================================
+
+
+class TestCompileTargetValidation:
+    """Tests for compile() target platform validation."""
+
+    @patch("simpler_setup.runtime_compiler.RuntimeCompiler._ensure_host_compilers")
+    def test_invalid_target_platform_raises(self, mock_ensure):
+        """Invalid target platform raises ValueError."""
+        env_manager._cache["ASCEND_HOME_PATH"] = None
+        from simpler_setup.runtime_compiler import RuntimeCompiler  # noqa: PLC0415
+
+        rc = RuntimeCompiler("a2a3sim")
+        with pytest.raises(ValueError, match="Invalid target platform"):
+            rc.compile("gpu", [], [], None)
diff --git a/tests/ut/py/test_task_interface.py b/tests/ut/py/test_task_interface.py
index 66024f2a9..e08c8d2ec 100644
--- a/tests/ut/py/test_task_interface.py
+++ b/tests/ut/py/test_task_interface.py
@@ -48,6 +48,7 @@ def test_enum_values_exist(self):
         assert DataType.UINT32 is not None
 
     def test_enum_int_values(self):
+        # ABI contract: values must match C++ header.
         assert DataType.FLOAT32.value == 0
         assert DataType.FLOAT16.value == 1
         assert DataType.INT32.value == 2
@@ -315,6 +316,7 @@ def test_clear(self):
 
 class TestTensorArgType:
     def test_enum_values(self):
+        # ABI contract: values must match C++ header.
         assert TensorArgType.INPUT.value == 0
         assert TensorArgType.OUTPUT.value == 1
         assert TensorArgType.INOUT.value == 2
@@ -444,7 +446,7 @@ def test_clear(self):
         assert args.scalar_count() == 0
 
     def test_no_capacity_limit_tensors(self):
-        """TaskArgs is vector-backed — no per-class capacity limit on tensors."""
+        """TaskArgs is vector-backed -- no per-class capacity limit on tensors."""
         args = TaskArgs()
         for i in range(20):
             args.add_tensor(ContinuousTensor.make(i, (1,), DataType.INT8))
@@ -464,6 +466,7 @@ def test_no_capacity_limit_scalars(self):
 
 class TestArgDirection:
     def test_enum_values(self):
+        # ABI contract: values must match C++ header.
         assert ArgDirection.SCALAR.value == 0
         assert ArgDirection.IN.value == 1
         assert ArgDirection.OUT.value == 2
diff --git a/tests/ut/py/test_toolchain_setup.py b/tests/ut/py/test_toolchain_setup.py
new file mode 100644
index 000000000..c45347e66
--- /dev/null
+++ b/tests/ut/py/test_toolchain_setup.py
@@ -0,0 +1,235 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Unit tests for simpler_setup/toolchain.py -- Toolchain configuration and flag generation."""
+
+import os
+from unittest.mock import patch
+
+import pytest
+from simpler import env_manager
+
+from simpler_setup.toolchain import (
+    Aarch64GxxToolchain,
+    CCECToolchain,
+    Gxx15Toolchain,
+    GxxToolchain,
+    ToolchainType,
+)
+
+# =============================================================================
+# Fixtures
+# =============================================================================
+
+
+@pytest.fixture(autouse=True)
+def _clear_env_manager_cache():
+    """Clear env_manager cache before each test."""
+    env_manager._cache.clear()
+    yield
+    env_manager._cache.clear()
+
+
+@pytest.fixture
+def mock_ascend_home(tmp_path):
+    """Provide a fake ASCEND_HOME_PATH with expected compiler directories."""
+    ascend = tmp_path / "ascend_toolkit"
+    # Create ccec paths for A2A3
+    (ascend / "bin").mkdir(parents=True)
+    (ascend / "bin" / "ccec").touch()
+    (ascend / "bin" / "ld.lld").touch()
+    # Create ccec paths for A5
+    (ascend / "tools" / "bisheng_compiler" / "bin").mkdir(parents=True)
+    (ascend / "tools" / "bisheng_compiler" / "bin" / "ccec").touch()
+    (ascend / "tools" / "bisheng_compiler" / "bin" / "ld.lld").touch()
+    # Create aarch64 cross-compiler paths
+    (ascend / "tools" / "hcc" / "bin").mkdir(parents=True)
+    (ascend / "tools" / "hcc" / "bin" / "aarch64-target-linux-gnu-g++").touch()
+    (ascend / "tools" / "hcc" / "bin" / "aarch64-target-linux-gnu-gcc").touch()
+
+    env_manager._cache["ASCEND_HOME_PATH"] = str(ascend)
+    return str(ascend)
+
+
+# =============================================================================
+# CCECToolchain tests
+# =============================================================================
+
+
+class TestCCECToolchain:
+    """Tests for CCECToolchain compile flags and cmake args."""
+
+    def test_compile_flags_a2a3_aiv(self, mock_ascend_home):
+        """A2A3 platform with aiv core type produces dav-c220-vec flags."""
+        tc = CCECToolchain(platform="a2a3")
+        flags = tc.get_compile_flags(core_type="aiv")
+        flag_str = " ".join(flags)
+        assert "dav-c220-vec" in flag_str
+
+    def test_compile_flags_a2a3_aic(self, mock_ascend_home):
+        """A2A3 platform with aic core type produces dav-c220-cube flags."""
+        tc = CCECToolchain(platform="a2a3")
+        flags = tc.get_compile_flags(core_type="aic")
+        flag_str = " ".join(flags)
+        assert "dav-c220-cube" in flag_str
+
+    def test_compile_flags_a5_aiv(self, mock_ascend_home):
+        """A5 platform with aiv core type produces dav-c310-vec flags."""
+        tc = CCECToolchain(platform="a5")
+        flags = tc.get_compile_flags(core_type="aiv")
+        flag_str = " ".join(flags)
+        assert "dav-c310-vec" in flag_str
+
+    def test_compile_flags_a5_aic(self, mock_ascend_home):
+        """A5 platform with aic core type produces dav-c310-cube flags."""
+        tc = CCECToolchain(platform="a5")
+        flags = tc.get_compile_flags(core_type="aic")
+        flag_str = " ".join(flags)
+        assert "dav-c310-cube" in flag_str
+
+    def test_unknown_platform_raises(self, mock_ascend_home):
+        """Unknown platform raises ValueError on get_compile_flags."""
+        tc = CCECToolchain(platform="unknown")
+        with pytest.raises(ValueError, match="Unknown platform"):
+            tc.get_compile_flags(core_type="aiv")
+
+    def test_missing_ccec_compiler_raises(self, tmp_path):
+        """Missing ccec binary raises FileNotFoundError."""
+        ascend = tmp_path / "empty_toolkit"
+        (ascend / "bin").mkdir(parents=True)
+        # No ccec binary created
+        env_manager._cache["ASCEND_HOME_PATH"] = str(ascend)
+
+        with pytest.raises(FileNotFoundError, match="ccec compiler not found"):
+            CCECToolchain(platform="a2a3")
+
+    def test_cmake_args_contain_bisheng(self, mock_ascend_home):
+        """CMake args include BISHENG_CC and BISHENG_LD."""
+        tc = CCECToolchain(platform="a2a3")
+        args = tc.get_cmake_args()
+        assert any("BISHENG_CC" in a for a in args)
+        assert any("BISHENG_LD" in a for a in args)
+
+
+# =============================================================================
+# Gxx15Toolchain tests
+# =============================================================================
+
+
+class TestGxx15Toolchain:
+    """Tests for Gxx15Toolchain compile flags."""
+
+    def test_compile_flags_aiv_defines(self):
+        """aiv core type adds -D__DAV_VEC__."""
+        env_manager._cache["ASCEND_HOME_PATH"] = None
+        tc = Gxx15Toolchain()
+        flags = tc.get_compile_flags(core_type="aiv")
+        assert "-D__DAV_VEC__" in flags
+
+    def test_compile_flags_aic_defines(self):
+        """aic core type adds -D__DAV_CUBE__."""
+        env_manager._cache["ASCEND_HOME_PATH"] = None
+        tc = Gxx15Toolchain()
+        flags = tc.get_compile_flags(core_type="aic")
+        assert "-D__DAV_CUBE__" in flags
+
+    def test_compile_flags_no_core_type(self):
+        """Empty core type adds neither __DAV_VEC__ nor __DAV_CUBE__."""
+        env_manager._cache["ASCEND_HOME_PATH"] = None
+        tc = Gxx15Toolchain()
+        flags = tc.get_compile_flags(core_type="")
+        assert "-D__DAV_VEC__" not in flags
+        assert "-D__DAV_CUBE__" not in flags
+
+    def test_compile_flags_contain_cpu_sim(self):
+        """Simulation flags include -D__CPU_SIM."""
+        env_manager._cache["ASCEND_HOME_PATH"] = None
+        tc = Gxx15Toolchain()
+        flags = tc.get_compile_flags()
+        assert "-D__CPU_SIM" in flags
+
+    def test_cmake_args_respect_env_vars(self):
+        """CMake args use CC/CXX env vars when set."""
+        env_manager._cache["ASCEND_HOME_PATH"] = None
+        tc = Gxx15Toolchain()
+        with patch.dict(os.environ, {"CC": "my-gcc", "CXX": "my-g++"}):
+            args = tc.get_cmake_args()
+            assert "-DCMAKE_C_COMPILER=my-gcc" in args
+            assert "-DCMAKE_CXX_COMPILER=my-g++" in args
+
+
+# =============================================================================
+# GxxToolchain tests
+# =============================================================================
+
+
+class TestGxxToolchain:
+    """Tests for GxxToolchain."""
+
+    def test_cmake_args_with_ascend(self, mock_ascend_home):
+        """With ASCEND_HOME_PATH, cmake args include it."""
+        tc = GxxToolchain()
+        args = tc.get_cmake_args()
+        assert any("ASCEND_HOME_PATH" in a for a in args)
+
+    def test_cmake_args_without_ascend(self):
+        """Without ASCEND_HOME_PATH, cmake args do not include it."""
+        env_manager._cache["ASCEND_HOME_PATH"] = None
+        tc = GxxToolchain()
+        args = tc.get_cmake_args()
+        assert not any("ASCEND_HOME_PATH" in a for a in args)
+
+    def test_compile_flags_contain_std17(self):
+        """Compile flags include C++17 standard."""
+        env_manager._cache["ASCEND_HOME_PATH"] = None
+        tc = GxxToolchain()
+        flags = tc.get_compile_flags()
+        assert "-std=c++17" in flags
+
+
+# =============================================================================
+# Aarch64GxxToolchain tests
+# =============================================================================
+
+
+class TestAarch64GxxToolchain:
+    """Tests for Aarch64GxxToolchain."""
+
+    def test_cmake_args_cross_compile(self, mock_ascend_home):
+        """CMake args include aarch64 cross-compiler paths."""
+        tc = Aarch64GxxToolchain()
+        args = tc.get_cmake_args()
+        assert any("aarch64-target-linux-gnu-gcc" in a for a in args)
+        assert any("aarch64-target-linux-gnu-g++" in a for a in args)
+
+    def test_missing_compiler_raises(self, tmp_path):
+        """Missing aarch64 compiler raises FileNotFoundError."""
+        ascend = tmp_path / "no_hcc"
+        (ascend / "tools" / "hcc" / "bin").mkdir(parents=True)
+        # No compiler binaries created
+        env_manager._cache["ASCEND_HOME_PATH"] = str(ascend)
+
+        with pytest.raises(FileNotFoundError, match="aarch64"):
+            Aarch64GxxToolchain()
+
+
+# =============================================================================
+# ToolchainType tests
+# =============================================================================
+
+
+class TestToolchainType:
+    """Tests for ToolchainType enum."""
+
+    def test_enum_values(self):
+        """ToolchainType values match compile_strategy.h."""
+        # ABI contract: values must match compile_strategy.h.
+        assert ToolchainType.CCEC == 0
+        assert ToolchainType.HOST_GXX_15 == 1
+        assert ToolchainType.HOST_GXX == 2
+        assert ToolchainType.AARCH64_GXX == 3