diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index ced571d80..92face020 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -74,6 +74,49 @@ if(NOT GTEST_LIB OR NOT GTEST_MAIN_LIB) set(GTEST_INCLUDE_DIRS "") # include dirs are carried by the gtest target endif() +# --------------------------------------------------------------------------- +# PTO2 runtime sources and stubs for a2a3 ring-buffer / tensormap tests +# --------------------------------------------------------------------------- +set(A2A3_RUNTIME_DIR ${CMAKE_SOURCE_DIR}/../../../src/a2a3/runtime/tensormap_and_ringbuffer/runtime) +set(PTO2_STUB_SOURCES ${CMAKE_SOURCE_DIR}/stubs/test_stubs.cpp) +set(PTO2_RUNTIME_SOURCES + ${A2A3_RUNTIME_DIR}/pto_ring_buffer.cpp + ${A2A3_RUNTIME_DIR}/pto_shared_memory.cpp + ${A2A3_RUNTIME_DIR}/pto_scheduler.cpp + ${A2A3_RUNTIME_DIR}/pto_tensormap.cpp +) + +set(PTO2_COMMON_INCLUDE_DIRS + ${CMAKE_SOURCE_DIR}/../../../src/a2a3/runtime/tensormap_and_ringbuffer/orchestration + ${CMAKE_SOURCE_DIR}/../../../src/a2a3/runtime/tensormap_and_ringbuffer/runtime + ${CMAKE_SOURCE_DIR}/../../../src/a2a3/runtime/tensormap_and_ringbuffer/common + ${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform/include + ${CMAKE_SOURCE_DIR}/../../../src/common/task_interface +) + +function(add_a2a3_pto2_runtime_test name) + cmake_parse_arguments(ARG "" "" "SOURCES;EXTRA_SOURCES" ${ARGN}) + set(_all_sources ${ARG_SOURCES} ${PTO2_STUB_SOURCES}) + foreach(src ${ARG_SOURCES} ${ARG_EXTRA_SOURCES}) + if(EXISTS ${src}) + list(APPEND _all_sources ${src}) + endif() + endforeach() + add_executable(${name} ${_all_sources}) + target_include_directories(${name} PRIVATE + ${GTEST_INCLUDE_DIRS} + ${PTO2_COMMON_INCLUDE_DIRS} + ) + target_compile_options(${name} PRIVATE -D_GLIBCXX_USE_CXX11_ABI=0) + target_link_libraries(${name} PRIVATE + ${GTEST_MAIN_LIB} + ${GTEST_LIB} + pthread + ) + add_test(NAME ${name} COMMAND ${name}) + set_tests_properties(${name} PROPERTIES LABELS "no_hardware") +endfunction() + # --------------------------------------------------------------------------- # Distributed runtime sources under test # --------------------------------------------------------------------------- @@ -151,13 +194,6 @@ function(add_a5_pto2_test name src) add_test(NAME ${name} COMMAND ${name}) endfunction() -enable_testing() - -add_hierarchical_test(test_tensormap test_tensormap.cpp) -add_hierarchical_test(test_ring test_ring.cpp) -add_hierarchical_test(test_scope test_scope.cpp) -add_hierarchical_test(test_orchestrator test_orchestrator.cpp) -add_hierarchical_test(test_scheduler test_scheduler.cpp) function(add_task_interface_test name src) add_executable(${name} ${src}) target_include_directories(${name} PRIVATE @@ -173,9 +209,63 @@ function(add_task_interface_test name src) add_test(NAME ${name} COMMAND ${name}) endfunction() -add_task_interface_test(test_child_memory test_child_memory.cpp) -add_a2a3_pto2_test(test_a2a3_pto2_fatal test_a2a3_pto2_fatal.cpp) -add_a5_pto2_test(test_a5_pto2_fatal test_a5_pto2_fatal.cpp) +enable_testing() + +# --------------------------------------------------------------------------- +# Hierarchical runtime tests (src/common/hierarchical/) +# --------------------------------------------------------------------------- +add_hierarchical_test(test_tensormap hierarchical/test_tensormap.cpp) +add_hierarchical_test(test_ring hierarchical/test_ring.cpp) +add_hierarchical_test(test_scope hierarchical/test_scope.cpp) +add_hierarchical_test(test_orchestrator 
hierarchical/test_orchestrator.cpp) +add_hierarchical_test(test_scheduler hierarchical/test_scheduler.cpp) + +# --------------------------------------------------------------------------- +# Types / task_interface tests (src/common/task_interface/) +# --------------------------------------------------------------------------- +add_task_interface_test(test_child_memory types/test_child_memory.cpp) + +# --------------------------------------------------------------------------- +# PTO2 A2A3 tests (src/a2a3/runtime/tensormap_and_ringbuffer/) +# --------------------------------------------------------------------------- +add_a2a3_pto2_test(test_a2a3_pto2_fatal pto2_a2a3/test_a2a3_pto2_fatal.cpp) + +# PTO2 runtime-linked tests (tensormap, orchestrator, coupling, boundary) +add_a2a3_pto2_runtime_test(test_tensormap_edge + SOURCES pto2_a2a3/test_tensormap_edge.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} +) +add_a2a3_pto2_runtime_test(test_coupling + SOURCES pto2_a2a3/test_coupling.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} ${A2A3_RUNTIME_DIR}/pto_orchestrator.cpp +) +add_a2a3_pto2_runtime_test(test_coupling_stub + SOURCES pto2_a2a3/test_coupling_stub.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} +) +add_a2a3_pto2_runtime_test(test_orchestrator_submit + SOURCES pto2_a2a3/test_orchestrator_submit.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} ${A2A3_RUNTIME_DIR}/pto_orchestrator.cpp +) +add_a2a3_pto2_runtime_test(test_orchestrator_fatal + SOURCES pto2_a2a3/test_orchestrator_fatal.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} ${A2A3_RUNTIME_DIR}/pto_orchestrator.cpp +) +add_a2a3_pto2_runtime_test(test_runtime_lifecycle + SOURCES pto2_a2a3/test_runtime_lifecycle.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} + ${A2A3_RUNTIME_DIR}/pto_runtime2.cpp + ${A2A3_RUNTIME_DIR}/pto_orchestrator.cpp +) +add_a2a3_pto2_runtime_test(test_boundary_edge + SOURCES pto2_a2a3/test_boundary_edge.cpp + EXTRA_SOURCES ${PTO2_RUNTIME_SOURCES} +) + +# --------------------------------------------------------------------------- +# PTO2 A5 tests (src/a5/runtime/tensormap_and_ringbuffer/) +# --------------------------------------------------------------------------- +add_a5_pto2_test(test_a5_pto2_fatal pto2_a5/test_a5_pto2_fatal.cpp) # Hardware-gated tests. Block is only entered when the project is configured # with -DSIMPLER_ENABLE_HARDWARE_TESTS=ON. CI's no-hw `ut` job does not pass @@ -238,5 +328,5 @@ if(SIMPLER_ENABLE_HARDWARE_TESTS) ) endfunction() - add_comm_api_test(test_hccl_comm test_hccl_comm.cpp) + add_comm_api_test(test_hccl_comm hardware/test_hccl_comm.cpp) endif() diff --git a/tests/ut/cpp/test_hccl_comm.cpp b/tests/ut/cpp/hardware/test_hccl_comm.cpp similarity index 96% rename from tests/ut/cpp/test_hccl_comm.cpp rename to tests/ut/cpp/hardware/test_hccl_comm.cpp index 858c488de..73c5cb91e 100644 --- a/tests/ut/cpp/test_hccl_comm.cpp +++ b/tests/ut/cpp/hardware/test_hccl_comm.cpp @@ -12,12 +12,12 @@ /* * Hardware UT guarding the CANN/HCCL-private ABI coupling in comm_hccl.cpp. * - * The call chain (dlopen → create_device_context → ensure_acl_ready_ctx → - * aclrtCreateStream → comm_init → comm_alloc_windows → ...) is not the - * interesting part — the interesting part is *what's inside* CommContext + * The call chain (dlopen -> create_device_context -> ensure_acl_ready_ctx -> + * aclrtCreateStream -> comm_init -> comm_alloc_windows -> ...) is not the + * interesting part -- the interesting part is *what's inside* CommContext * after comm_alloc_windows returns. 
That struct comes from one of: * - * - MESH topology: `reinterpret_cast(HCCL's return ptr)` — + * - MESH topology: `reinterpret_cast(HCCL's return ptr)` -- * our layout is *assumed* to match HCCL's internal MESH context. * - RING topology: our parser reads HcclOpResParam / HcclRankRelationResV2 * field-by-field using offsetof against reverse-engineered struct defs. @@ -40,7 +40,7 @@ * gate SIMPLER_ENABLE_HARDWARE_TESTS. Device allocation is driven by * CTest RESOURCE_GROUPS + --resource-spec-file. * - * Linking strategy: libhost_runtime.so is dlopen'd — it is the subject + * Linking strategy: libhost_runtime.so is dlopen'd -- it is the subject * under test and mirrors how ChipWorker loads a runtime backend in * production. libascendcl.so is linked directly at compile time because * it is generic CANN infra; going through dlsym for acl* here buys nothing @@ -122,14 +122,14 @@ constexpr int EXIT_WINDOW_SIZE = 50; // the CommContext returned by HCCL (MESH reinterpret_cast) or built by our // RING parser actually contains the fields we expect at the offsets we // expect. Failure here means our reverse-engineered CANN ABI disagrees with -// the live HCCL build — the CANN-coupling fragility this test is here for. +// the live HCCL build -- the CANN-coupling fragility this test is here for. constexpr int EXIT_CTX_MEMCPY = 55; constexpr int EXIT_CTX_FIELDS = 56; constexpr int EXIT_BARRIER = 60; constexpr int EXIT_DESTROY = 70; int run_rank(int rank, int nranks, int device_id, const char *rootinfo_path) { - // libhost_runtime.so is the subject under test — dlopen mirrors + // libhost_runtime.so is the subject under test -- dlopen mirrors // ChipWorker. libascendcl is linked in, so acl* is available directly. void *host_handle = dlopen(PTO_HOST_RUNTIME_LIB_PATH, RTLD_NOW | RTLD_LOCAL); if (host_handle == nullptr) { @@ -215,7 +215,7 @@ int run_rank(int rank, int nranks, int device_id, const char *rootinfo_path) { host_ctx.windowsIn[rank] != local_base) { fprintf( stderr, - "[rank %d] CommContext field mismatch — CANN ABI drift?\n" + "[rank %d] CommContext field mismatch -- CANN ABI drift?\n" " got: rankId=%u rankNum=%u winSize=%lu windowsIn[%d]=0x%lx\n" " expected: rankId=%d rankNum=%d winSize=%zu windowsIn[%d]=0x%lx\n", rank, host_ctx.rankId, host_ctx.rankNum, static_cast(host_ctx.winSize), rank, diff --git a/tests/ut/cpp/test_orchestrator.cpp b/tests/ut/cpp/hierarchical/test_orchestrator.cpp similarity index 96% rename from tests/ut/cpp/test_orchestrator.cpp rename to tests/ut/cpp/hierarchical/test_orchestrator.cpp index 14919b11e..82fac02d7 100644 --- a/tests/ut/cpp/test_orchestrator.cpp +++ b/tests/ut/cpp/hierarchical/test_orchestrator.cpp @@ -48,7 +48,7 @@ struct OrchestratorFixture : public ::testing::Test { void TearDown() override { allocator.shutdown(); } - // Per-slot accessor — slot state lives inside the Ring now. + // Per-slot accessor -- slot state lives inside the Ring now. TaskSlotState &S(TaskSlot id) { return *allocator.slot_state(id); } // Helper: build a TaskArgs whose only tensor has the given (data, tag). 
@@ -86,7 +86,7 @@ TEST_F(OrchestratorFixture, DependentTaskIsPending) { TaskSlot a_slot; rq.try_pop(a_slot); - // Task B reads INPUT at the same key — depends on A + // Task B reads INPUT at the same key -- depends on A auto args_b = single_tensor_args(0xBEEF, TensorArgType::INPUT); auto b = orch.submit_next_level(0xDEAD, args_b, cfg); EXPECT_EQ(S(b.task_slot).state.load(), TaskState::PENDING); @@ -151,7 +151,7 @@ TEST_F(OrchestratorFixture, NoDepTagSkipsDependencyTracking) { TaskSlot drain_slot; rq.try_pop(drain_slot); - // Second task references same key but tagged NO_DEP — should be independent + // Second task references same key but tagged NO_DEP -- should be independent auto args_b = single_tensor_args(0xAAAA, TensorArgType::NO_DEP); auto b = orch.submit_next_level(0xDEAD, args_b, cfg); EXPECT_EQ(S(b.task_slot).state.load(), TaskState::READY); @@ -215,7 +215,7 @@ TEST_F(OrchestratorFixture, OutputAutoAllocsFromHeapRing) { TEST_F(OrchestratorFixture, InoutWiresCreatorAsFanin) { // INOUT is the only tag that pulls in the prior writer as a fanin - // producer — matching L2's pto_orchestrator.cpp Step B where only + // producer -- matching L2's pto_orchestrator.cpp Step B where only // INPUT / INOUT do tensor_map.lookup. Users who want a WaW dep on // the alloc-slot (so its HeapRing slab stays live while they write) // must tag the buffer INOUT. @@ -250,7 +250,7 @@ TEST_F(OrchestratorFixture, InoutWiresCreatorAsFanin) { TEST_F(OrchestratorFixture, OutputAndOutputExistingAreInsertOnly) { // Contrast with INOUT: plain OUTPUT and OUTPUT_EXISTING are pure - // overwrites — insert into TensorMap, no lookup, so no fanin wire + // overwrites -- insert into TensorMap, no lookup, so no fanin wire // on the prior writer. Matches L2 semantics for both tags. Users // who need creator lifetime must tag the buffer INOUT. struct Case { diff --git a/tests/ut/cpp/test_ring.cpp b/tests/ut/cpp/hierarchical/test_ring.cpp similarity index 96% rename from tests/ut/cpp/test_ring.cpp rename to tests/ut/cpp/hierarchical/test_ring.cpp index 05152972d..7f0689b7d 100644 --- a/tests/ut/cpp/test_ring.cpp +++ b/tests/ut/cpp/hierarchical/test_ring.cpp @@ -129,7 +129,7 @@ TEST(Ring, SlotStateIsPointerStable) { TaskSlotState *p0 = a.slot_state(r0.slot); ASSERT_NE(p0, nullptr); - // Push many more slots through — the deque may grow/chain, but the + // Push many more slots through -- the deque may grow/chain, but the // pointer we grabbed for slot 0 has to stay valid. for (int i = 0; i < 1000; ++i) { (void)a.alloc(); @@ -227,7 +227,7 @@ TEST(Ring, ScopeDepthMapsToRingIdx) { } TEST(Ring, PerRingHeapsAreDistinctMmaps) { - // Total VA = 4 × 4 KiB; verify each ring has its own mapping. + // Total VA = 4 x 4 KiB; verify each ring has its own mapping. Ring a; a.init(kSmallHeap, kQuickTimeoutMs); @@ -241,7 +241,7 @@ TEST(Ring, PerRingHeapsAreDistinctMmaps) { for (int i = 0; i < MAX_RING_DEPTH; ++i) { for (int j = i + 1; j < MAX_RING_DEPTH; ++j) { EXPECT_NE(bases[i], bases[j]) - << "rings " << i << " and " << j << " share a mapping — expected 4 separate mmaps"; + << "rings " << i << " and " << j << " share a mapping -- expected 4 separate mmaps"; } } } @@ -292,7 +292,7 @@ TEST(Ring, RingsReclaimIndependently) { EXPECT_EQ(r1a.ring_idx, 1); EXPECT_EQ(r1b.ring_idx, 1); - // Ring 0 is untouched — this must succeed instantly, not time out. + // Ring 0 is untouched -- this must succeed instantly, not time out. 
auto r0 = a.alloc(HEAP_ALIGN, /*scope_depth=*/0); EXPECT_EQ(r0.ring_idx, 0); ASSERT_NE(r0.heap_ptr, nullptr); @@ -322,7 +322,7 @@ TEST(Ring, InnerRingReclaimsWhileOuterHolds) { EXPECT_EQ(a.heap_top(0), HEAP_ALIGN); EXPECT_EQ(a.heap_tail(0), 0u); - // Churn on the inner ring — allocate, release, allocate, release, ... + // Churn on the inner ring -- allocate, release, allocate, release, ... for (int i = 0; i < 8; ++i) { auto inner = a.alloc(HEAP_ALIGN, /*scope_depth=*/1); a.release(inner.slot); @@ -331,7 +331,7 @@ TEST(Ring, InnerRingReclaimsWhileOuterHolds) { // Outer ring unchanged (one live slab at offset 0). EXPECT_EQ(a.heap_top(0), HEAP_ALIGN); EXPECT_EQ(a.heap_tail(0), 0u); - // Inner ring reclaimed each slab — tail caught up to top. + // Inner ring reclaimed each slab -- tail caught up to top. EXPECT_EQ(a.heap_tail(1), a.heap_top(1)); a.release(outer.slot); diff --git a/tests/ut/cpp/test_scheduler.cpp b/tests/ut/cpp/hierarchical/test_scheduler.cpp similarity index 98% rename from tests/ut/cpp/test_scheduler.cpp rename to tests/ut/cpp/hierarchical/test_scheduler.cpp index f13dd240f..87c50a895 100644 --- a/tests/ut/cpp/test_scheduler.cpp +++ b/tests/ut/cpp/hierarchical/test_scheduler.cpp @@ -205,7 +205,7 @@ TEST_F(SchedulerFixture, DependentTaskDispatchedAfterProducerCompletes) { } // =========================================================================== -// Group task tests — fixture with 2 MockWorkers +// Group task tests -- fixture with 2 MockWorkers // =========================================================================== struct GroupSchedulerFixture : public ::testing::Test { @@ -405,7 +405,7 @@ TEST_F(MixedTypeSchedulerFixture, SubTaskDispatchesWhileNextLevelPoolSaturated) EXPECT_TRUE(next_level_worker.is_running.load()) << "chip worker must still be busy"; // Complete the sub task first; it reaches CONSUMED while the chip task - // is still running — demonstrating independent per-type dispatch. + // is still running -- demonstrating independent per-type dispatch. sub_worker.complete(); wait_consumed(sub.task_slot); EXPECT_FALSE(is_consumed(chip.task_slot)); @@ -416,7 +416,7 @@ TEST_F(MixedTypeSchedulerFixture, SubTaskDispatchesWhileNextLevelPoolSaturated) TEST_F(GroupSchedulerFixture, GroupDependencyChain) { // Group A (2 workers) produces an OUTPUT at key 0xCAFE. - // Task B reads INPUT at the same key — depends on group A. + // Task B reads INPUT at the same key -- depends on group A. 
TaskArgs a0 = single_tensor_args(0xCAFE, TensorArgType::OUTPUT); TaskArgs a1 = single_tensor_args(0xCAFE, TensorArgType::OUTPUT); auto a = orch.submit_next_level_group(0xDEAD, {a0, a1}, cfg); diff --git a/tests/ut/cpp/test_scope.cpp b/tests/ut/cpp/hierarchical/test_scope.cpp similarity index 97% rename from tests/ut/cpp/test_scope.cpp rename to tests/ut/cpp/hierarchical/test_scope.cpp index d8350d1c6..273b33bfc 100644 --- a/tests/ut/cpp/test_scope.cpp +++ b/tests/ut/cpp/hierarchical/test_scope.cpp @@ -43,7 +43,7 @@ TEST(Scope, SingleScope_ReleasesRegisteredTasks) { TEST(Scope, RegisterOutsideScopeIsNoop) { Scope sc; - sc.register_task(5); // no open scope — should not throw + sc.register_task(5); // no open scope -- should not throw EXPECT_EQ(sc.depth(), 0); } diff --git a/tests/ut/cpp/test_tensormap.cpp b/tests/ut/cpp/hierarchical/test_tensormap.cpp similarity index 100% rename from tests/ut/cpp/test_tensormap.cpp rename to tests/ut/cpp/hierarchical/test_tensormap.cpp diff --git a/tests/ut/cpp/test_a2a3_pto2_fatal.cpp b/tests/ut/cpp/pto2_a2a3/test_a2a3_pto2_fatal.cpp similarity index 90% rename from tests/ut/cpp/test_a2a3_pto2_fatal.cpp rename to tests/ut/cpp/pto2_a2a3/test_a2a3_pto2_fatal.cpp index b4e2c8e00..1ea2aa042 100644 --- a/tests/ut/cpp/test_a2a3_pto2_fatal.cpp +++ b/tests/ut/cpp/pto2_a2a3/test_a2a3_pto2_fatal.cpp @@ -41,6 +41,8 @@ struct FakeRuntime { std::string last_fatal_message; }; +static_assert(offsetof(FakeRuntime, ops) == 0); // Guard: reinterpret_cast below assumes ops is first member. + FakeRuntime *as_fake(PTO2Runtime *rt) { return reinterpret_cast(rt); } TaskOutputTensors fake_submit(PTO2Runtime *rt, const MixedKernels &, const Arg &) { @@ -87,20 +89,20 @@ TaskOutputTensors fake_alloc_tensors(PTO2Runtime *rt, const Arg &) { } const PTO2RuntimeOps kFakeOps = { - fake_submit, - fake_scope_begin, - fake_scope_end, - fake_orchestration_done, - fake_is_fatal, - fake_report_fatal, - fake_log, - fake_log, - fake_log, - fake_log, - fake_log, - fake_get_tensor_data, - fake_set_tensor_data, - fake_alloc_tensors, + .submit_task = fake_submit, + .scope_begin = fake_scope_begin, + .scope_end = fake_scope_end, + .orchestration_done = fake_orchestration_done, + .is_fatal = fake_is_fatal, + .report_fatal = fake_report_fatal, + .log_error = fake_log, + .log_warn = fake_log, + .log_info = fake_log, + .log_debug = fake_log, + .log_always = fake_log, + .get_tensor_data = fake_get_tensor_data, + .set_tensor_data = fake_set_tensor_data, + .alloc_tensors = fake_alloc_tensors, }; class RuntimeBindingGuard { diff --git a/tests/ut/cpp/pto2_a2a3/test_boundary_edge.cpp b/tests/ut/cpp/pto2_a2a3/test_boundary_edge.cpp new file mode 100644 index 000000000..b17ff85ed --- /dev/null +++ b/tests/ut/cpp/pto2_a2a3/test_boundary_edge.cpp @@ -0,0 +1,693 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Supplemental boundary-condition tests for: + * 1. ReadyQueue high-contention stress (8+ threads, exactly-once guarantee) + * 2. TaskAllocator double-destroy / re-init safety + * 3. Scheduler sequence counter near INT64 wrap + * 4. SharedMemory concurrent read/write of per-ring flow control + */ + +#include + +#include +#include +#include +#include +#include + +#include "pto_ring_buffer.h" +#include "pto_scheduler.h" +#include "pto_shared_memory.h" +#include "../test_helpers.h" + +// ============================================================================= +// 1. ReadyQueue high-contention stress +// ============================================================================= + +class ReadyQueueStressTest : public ::testing::Test { +protected: + static constexpr uint64_t kCapacity = 512; + PTO2ReadyQueue queue; + + void SetUp() override { ASSERT_TRUE(pto2_ready_queue_init(&queue, kCapacity)); } + + void TearDown() override { pto2_ready_queue_destroy(&queue); } +}; + +// 8 producers / 8 consumers, high volume -- every item consumed exactly once +TEST_F(ReadyQueueStressTest, EightProducersEightConsumers) { + constexpr int kItemsPerProducer = 2000; + constexpr int kProducers = 8; + constexpr int kConsumers = 8; + constexpr int kTotalItems = kItemsPerProducer * kProducers; + + std::vector items(kTotalItems); + for (int i = 0; i < kTotalItems; i++) { + items[i].fanin_count = i; + } + + std::vector> consumed_count(kTotalItems); + for (auto &c : consumed_count) + c.store(0, std::memory_order_relaxed); + + std::atomic producers_done{0}; + + auto producer = [&](int id) { + int base = id * kItemsPerProducer; + for (int i = 0; i < kItemsPerProducer; i++) { + while (!queue.push(&items[base + i])) {} + } + producers_done.fetch_add(1, std::memory_order_release); + }; + + auto consumer = [&](std::atomic &local_count) { + while (true) { + PTO2TaskSlotState *item = queue.pop(); + if (item) { + consumed_count[item->fanin_count].fetch_add(1, std::memory_order_relaxed); + local_count.fetch_add(1, std::memory_order_relaxed); + } else if (producers_done.load(std::memory_order_acquire) == kProducers) { + // Final drain + while ((item = queue.pop()) != nullptr) { + consumed_count[item->fanin_count].fetch_add(1, std::memory_order_relaxed); + local_count.fetch_add(1, std::memory_order_relaxed); + } + break; + } + } + }; + + std::vector> per_consumer_count(kConsumers); + for (auto &c : per_consumer_count) + c.store(0); + + std::vector threads; + for (int i = 0; i < kProducers; i++) { + threads.emplace_back(producer, i); + } + for (int i = 0; i < kConsumers; i++) { + threads.emplace_back(consumer, std::ref(per_consumer_count[i])); + } + for (auto &t : threads) + t.join(); + + // Every item consumed exactly once + int total = 0; + for (int i = 0; i < kTotalItems; i++) { + EXPECT_EQ(consumed_count[i].load(), 1) << "Item " << i << " consumed " << consumed_count[i].load() << " times"; + total += consumed_count[i].load(); + } + EXPECT_EQ(total, kTotalItems); + + // Work is distributed across consumers (not all consumed by one) + int active_consumers = 0; + for (int i = 0; i < kConsumers; i++) { + if (per_consumer_count[i].load() > 0) active_consumers++; + } + EXPECT_GT(active_consumers, 1) << "Work should be distributed across multiple consumers"; +} + +// Rapid fill-drain cycles under contention +TEST_F(ReadyQueueStressTest, RapidFillDrainCycles) { + constexpr int kCycles = 100; + constexpr 
int kItemsPerCycle = static_cast(kCapacity / 2); + + std::vector items(kItemsPerCycle); + for (int i = 0; i < kItemsPerCycle; i++) { + items[i].fanin_count = i; + } + + for (int cycle = 0; cycle < kCycles; cycle++) { + std::atomic push_done{0}; + std::atomic popped{0}; + + // 4 producers push in parallel + auto producer = [&](int id) { + int per_thread = kItemsPerCycle / 4; + int base = id * per_thread; + for (int i = 0; i < per_thread; i++) { + while (!queue.push(&items[base + i])) {} + } + push_done.fetch_add(1, std::memory_order_release); + }; + + // 4 consumers drain in parallel + auto consumer = [&]() { + while (true) { + PTO2TaskSlotState *s = queue.pop(); + if (s) { + popped.fetch_add(1, std::memory_order_relaxed); + } else if (push_done.load(std::memory_order_acquire) == 4) { + while ((s = queue.pop()) != nullptr) { + popped.fetch_add(1, std::memory_order_relaxed); + } + break; + } + } + }; + + std::vector threads; + for (int i = 0; i < 4; i++) + threads.emplace_back(producer, i); + for (int i = 0; i < 4; i++) + threads.emplace_back(consumer); + for (auto &t : threads) + t.join(); + + ASSERT_EQ(popped.load(), kItemsPerCycle) << "Cycle " << cycle << ": lost items"; + } +} + +// push_batch + pop_batch under contention +TEST_F(ReadyQueueStressTest, BatchPushPopContention) { + constexpr int kBatchSize = 8; + constexpr int kBatches = 500; + constexpr int kProducers = 4; + constexpr int kTotalItems = kBatchSize * kBatches * kProducers; + + std::vector items(kTotalItems); + for (int i = 0; i < kTotalItems; i++) + items[i].fanin_count = i; + + std::atomic total_consumed{0}; + std::atomic producers_done{0}; + + auto producer = [&](int id) { + int base = id * kBatchSize * kBatches; + for (int b = 0; b < kBatches; b++) { + PTO2TaskSlotState *ptrs[kBatchSize]; + for (int i = 0; i < kBatchSize; i++) { + ptrs[i] = &items[base + b * kBatchSize + i]; + } + // push_batch may partially fail if queue is near full; retry + for (int i = 0; i < kBatchSize; i++) { + while (!queue.push(ptrs[i])) {} + } + } + producers_done.fetch_add(1, std::memory_order_release); + }; + + auto consumer = [&]() { + while (true) { + PTO2TaskSlotState *out[kBatchSize]; + int n = queue.pop_batch(out, kBatchSize); + total_consumed.fetch_add(n, std::memory_order_relaxed); + if (n == 0 && producers_done.load(std::memory_order_acquire) == kProducers) { + // Final drain + while (true) { + n = queue.pop_batch(out, kBatchSize); + if (n == 0) break; + total_consumed.fetch_add(n, std::memory_order_relaxed); + } + break; + } + } + }; + + std::vector threads; + for (int i = 0; i < kProducers; i++) + threads.emplace_back(producer, i); + for (int i = 0; i < 4; i++) + threads.emplace_back(consumer); + for (auto &t : threads) + t.join(); + + EXPECT_EQ(total_consumed.load(), kTotalItems); +} + +// ============================================================================= +// 2. 
TaskAllocator double-destroy / re-init safety +// ============================================================================= + +class TaskAllocatorDoubleDestroyTest : public ::testing::Test { +protected: + static constexpr int32_t WINDOW_SIZE = 16; + static constexpr uint64_t HEAP_SIZE = 1024; + + std::vector descriptors; + alignas(64) uint8_t heap_buf[1024]{}; + std::atomic current_index{0}; + std::atomic last_alive{0}; + std::atomic error_code{PTO2_ERROR_NONE}; + PTO2TaskAllocator allocator{}; + + void InitAllocator() { + descriptors.assign(WINDOW_SIZE, PTO2TaskDescriptor{}); + std::memset(heap_buf, 0, sizeof(heap_buf)); + current_index.store(0); + last_alive.store(0); + error_code.store(PTO2_ERROR_NONE); + allocator.init(descriptors.data(), WINDOW_SIZE, ¤t_index, &last_alive, heap_buf, HEAP_SIZE, &error_code); + } +}; + +// Re-init after use: allocator should work fresh +TEST_F(TaskAllocatorDoubleDestroyTest, ReInitAfterUse) { + InitAllocator(); + + // Use the allocator + auto r1 = allocator.alloc(128); + ASSERT_FALSE(r1.failed()); + auto r2 = allocator.alloc(128); + ASSERT_FALSE(r2.failed()); + EXPECT_EQ(r2.task_id, 1); + + // Re-init: should reset state + InitAllocator(); + + // Should start from task_id 0 again + auto r3 = allocator.alloc(64); + ASSERT_FALSE(r3.failed()); + EXPECT_EQ(r3.task_id, 0) << "Re-init should reset task ID counter"; + EXPECT_EQ(r3.slot, 0); +} + +// Re-init with different heap size +TEST_F(TaskAllocatorDoubleDestroyTest, ReInitDifferentHeapSize) { + InitAllocator(); + + auto r1 = allocator.alloc(HEAP_SIZE); + ASSERT_FALSE(r1.failed()); + EXPECT_EQ(allocator.heap_top(), HEAP_SIZE); + + // Re-init with same buffer but fresh state + InitAllocator(); + EXPECT_EQ(allocator.heap_top(), 0u) << "Re-init resets heap_top"; + EXPECT_EQ(allocator.heap_available(), HEAP_SIZE) << "Re-init restores full capacity"; +} + +// Re-init after error state: error flag should be clearable +TEST_F(TaskAllocatorDoubleDestroyTest, ReInitClearsErrorState) { + InitAllocator(); + + // Force a deadlock error + auto r = allocator.alloc(HEAP_SIZE * 2); + EXPECT_TRUE(r.failed()); + EXPECT_NE(error_code.load(), PTO2_ERROR_NONE); + + // Re-init clears error + InitAllocator(); + EXPECT_EQ(error_code.load(), PTO2_ERROR_NONE); + + // Allocator should work again + auto r2 = allocator.alloc(64); + EXPECT_FALSE(r2.failed()); +} + +// Multiple re-init cycles: no resource leak or corruption +TEST_F(TaskAllocatorDoubleDestroyTest, MultipleReInitCycles) { + for (int cycle = 0; cycle < 10; cycle++) { + InitAllocator(); + + for (int i = 0; i < WINDOW_SIZE - 1; i++) { + auto r = allocator.alloc(0); + ASSERT_FALSE(r.failed()) << "Cycle " << cycle << " alloc " << i; + EXPECT_EQ(r.task_id, i); + } + } +} + +// Re-init with stale last_alive: allocator sees fresh state +TEST_F(TaskAllocatorDoubleDestroyTest, ReInitIgnoresStaleLastAlive) { + InitAllocator(); + + // Advance state + auto r1 = allocator.alloc(64); + ASSERT_FALSE(r1.failed()); + last_alive.store(5, std::memory_order_release); // Stale value + + // Re-init resets last_alive + InitAllocator(); + EXPECT_EQ(last_alive.load(), 0); + + auto r2 = allocator.alloc(64); + ASSERT_FALSE(r2.failed()); + EXPECT_EQ(r2.task_id, 0); +} + +// ============================================================================= +// 3. 
Scheduler sequence counter near INT64 wrap +// ============================================================================= + +class SequenceWrapTest : public ::testing::Test { +protected: + static constexpr uint64_t QUEUE_CAP = 8; + PTO2ReadyQueueSlot slots[8]{}; + PTO2ReadyQueue queue{}; + PTO2TaskSlotState dummy[8]{}; + + void InitQueueAtSequence(int64_t start_seq) { test_ready_queue_init(&queue, slots, QUEUE_CAP, start_seq); } +}; + +// Sequence near INT64_MAX: push/pop should still work +TEST_F(SequenceWrapTest, NearInt64Max) { + int64_t near_max = INT64_MAX - 16; + InitQueueAtSequence(near_max); + + // Push and pop several items, crossing INT64_MAX + for (int i = 0; i < 5; i++) { + ASSERT_TRUE(queue.push(&dummy[i])) << "Push " << i << " near INT64_MAX"; + } + + for (int i = 0; i < 5; i++) { + PTO2TaskSlotState *s = queue.pop(); + ASSERT_NE(s, nullptr) << "Pop " << i << " near INT64_MAX"; + EXPECT_EQ(s, &dummy[i]); + } + EXPECT_EQ(queue.pop(), nullptr); +} + +// Sequence near INT64_MAX: fill to capacity then drain +TEST_F(SequenceWrapTest, FillDrainNearMax) { + int64_t near_max = INT64_MAX - 4; + InitQueueAtSequence(near_max); + + int pushed = 0; + for (uint64_t i = 0; i < QUEUE_CAP; i++) { + if (queue.push(&dummy[i % 8])) pushed++; + else break; + } + EXPECT_GE(pushed, 1) << "Should push at least some items near max"; + + for (int i = 0; i < pushed; i++) { + EXPECT_NE(queue.pop(), nullptr); + } + EXPECT_EQ(queue.pop(), nullptr); +} + +// Sequence near INT64_MAX: interleaved push/pop crossing the boundary +TEST_F(SequenceWrapTest, InterleavedAcrossBoundary) { + int64_t near_max = INT64_MAX - 2; + InitQueueAtSequence(near_max); + + // Each push/pop advances sequence by 1; after 5 cycles we cross INT64_MAX + for (int i = 0; i < 10; i++) { + ASSERT_TRUE(queue.push(&dummy[0])) << "Push " << i << " at sequence ~" << (near_max + i); + PTO2TaskSlotState *s = queue.pop(); + ASSERT_NE(s, nullptr) << "Pop " << i; + EXPECT_EQ(s, &dummy[0]); + } +} + +// Sequence at exactly INT64_MAX: single push/pop +TEST_F(SequenceWrapTest, ExactlyAtInt64Max) { + InitQueueAtSequence(INT64_MAX); + + ASSERT_TRUE(queue.push(&dummy[0])); + PTO2TaskSlotState *s = queue.pop(); + EXPECT_EQ(s, &dummy[0]); +} + +// --------------------------------------------------------------------------- +// BUG-CANDIDATE: pop() fast-path uses unsigned comparison `d >= e`. +// +// When enqueue_pos crosses INT64_MAX (as uint64_t), the arithmetic is still +// valid for unsigned because uint64 wraps modularly. However, inside push() +// and pop(), `static_cast(pos)` reinterprets bits: a pos of +// 0x8000000000000000 becomes INT64_MIN. The sequence counters undergo the +// same reinterpretation, so diff calculations remain consistent. +// +// The REAL concern is pop()'s fast-path: `if (d >= e) return nullptr`. +// After enough operations, enqueue_pos wraps around UINT64_MAX back to a +// small number while dequeue_pos is still large. At that point d > e +// (unsigned), causing pop() to return nullptr even though items are queued. +// +// This test starts positions near UINT64_MAX to simulate the wrap scenario. +// It documents that UINT64_MAX overflow in enqueue_pos/dequeue_pos would +// break the fast-path, but this requires 2^64 operations -- practically +// unreachable. We test the INT64 boundary (2^63) which IS reachable in +// extremely long-running graphs. 
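+//
+// A worked example of the modular arithmetic (illustrative values, not taken
+// from the implementation): with pos = 0x7FFFFFFFFFFFFFFF (INT64_MAX), the
+// next position is 0x8000000000000000, which reinterprets to INT64_MIN as a
+// signed value; yet (pos + 1) - pos == 1 in uint64_t, so the relative
+// distance between enqueue_pos and dequeue_pos stays correct across the 2^63
+// boundary and only becomes ambiguous once the counters approach UINT64_MAX.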
+// --------------------------------------------------------------------------- +TEST_F(SequenceWrapTest, PushBatchThenPopAcrossInt64Boundary) { + // Start at INT64_MAX - 2 so that after 3 pushes, enqueue_pos crosses + // into the INT64_MIN region (as signed), while dequeue_pos stays at + // INT64_MAX - 2. + int64_t start = INT64_MAX - 2; + InitQueueAtSequence(start); + + // Push 5 items: pos goes INT64_MAX-2, -1, MAX, MAX+1, MAX+2 + for (int i = 0; i < 5; i++) { + ASSERT_TRUE(queue.push(&dummy[i])) << "Push " << i << " failed (pos would be ~INT64_MAX+" << (i - 2) << ")"; + } + + // Pop all 5: dequeue_pos starts at INT64_MAX-2, catches up. + // The fast-path `d >= e` compares unsigned values; since both grow + // monotonically as uint64_t, this stays correct across the signed + // boundary. + for (int i = 0; i < 5; i++) { + PTO2TaskSlotState *s = queue.pop(); + ASSERT_NE(s, nullptr) << "Pop " << i << " returned nullptr -- fast-path may have misjudged empty"; + EXPECT_EQ(s, &dummy[i]); + } + EXPECT_EQ(queue.pop(), nullptr); +} + +// Concurrent push/pop near INT64_MAX boundary +TEST_F(SequenceWrapTest, ConcurrentNearMax) { + static constexpr uint64_t BIG_CAP = 64; + PTO2ReadyQueueSlot big_slots[BIG_CAP]; + PTO2ReadyQueue big_queue{}; + int64_t start = INT64_MAX - 500; + test_ready_queue_init(&big_queue, big_slots, BIG_CAP, start); + + constexpr int N = 1000; + std::vector items(N); + for (int i = 0; i < N; i++) + items[i].fanin_count = i; + + std::atomic consumed{0}; + std::atomic prod_done{false}; + + auto producer = [&]() { + for (int i = 0; i < N; i++) { + while (!big_queue.push(&items[i])) {} + } + prod_done.store(true, std::memory_order_release); + }; + + auto consumer = [&]() { + while (true) { + PTO2TaskSlotState *s = big_queue.pop(); + if (s) { + consumed.fetch_add(1, std::memory_order_relaxed); + } else if (prod_done.load(std::memory_order_acquire)) { + while ((s = big_queue.pop()) != nullptr) { + consumed.fetch_add(1, std::memory_order_relaxed); + } + break; + } + } + }; + + std::thread p(producer); + std::thread c1(consumer); + std::thread c2(consumer); + p.join(); + c1.join(); + c2.join(); + + EXPECT_EQ(consumed.load(), N); +} + +// ============================================================================= +// 4. 
SharedMemory concurrent read/write of per-ring flow control +// ============================================================================= + +class SharedMemoryConcurrentTest : public ::testing::Test { +protected: + PTO2SharedMemoryHandle *handle = nullptr; + + void SetUp() override { + handle = pto2_sm_create(256, 4096); + ASSERT_NE(handle, nullptr); + } + + void TearDown() override { + if (handle) { + pto2_sm_destroy(handle); + handle = nullptr; + } + } +}; + +// Concurrent current_task_index updates across different rings: no cross-ring interference +TEST_F(SharedMemoryConcurrentTest, PerRingTaskIndexIsolation) { + constexpr int kIterations = 10000; + + auto writer = [&](int ring) { + auto &fc = handle->header->rings[ring].fc; + for (int i = 1; i <= kIterations; i++) { + fc.current_task_index.store(static_cast(i), std::memory_order_release); + } + }; + + auto reader = [&](int ring, bool *saw_other_ring_value) { + auto &fc = handle->header->rings[ring].fc; + int32_t prev = 0; + for (int i = 0; i < kIterations; i++) { + int32_t val = fc.current_task_index.load(std::memory_order_acquire); + // Values should be monotonically increasing within a ring + if (val < prev) { + *saw_other_ring_value = true; + } + prev = val; + } + }; + + // Write to ring 0 and ring 1 concurrently; read from each + bool ring0_corrupted = false; + bool ring1_corrupted = false; + + std::thread w0(writer, 0); + std::thread w1(writer, 1); + std::thread r0(reader, 0, &ring0_corrupted); + std::thread r1(reader, 1, &ring1_corrupted); + + w0.join(); + w1.join(); + r0.join(); + r1.join(); + + EXPECT_FALSE(ring0_corrupted) << "Ring 0 current_task_index should be monotonic"; + EXPECT_FALSE(ring1_corrupted) << "Ring 1 current_task_index should be monotonic"; + + // Final values should be kIterations for each ring (independently) + EXPECT_EQ(handle->header->rings[0].fc.current_task_index.load(), static_cast(kIterations)); + EXPECT_EQ(handle->header->rings[1].fc.current_task_index.load(), static_cast(kIterations)); +} + +// Concurrent current_task_index increment: simulate orchestrator publishing task IDs +TEST_F(SharedMemoryConcurrentTest, TaskIndexAtomicIncrement) { + constexpr int kIncrements = 5000; + constexpr int kThreads = 4; + + auto &fc = handle->header->rings[0].fc; + fc.current_task_index.store(0, std::memory_order_relaxed); + + auto incrementer = [&]() { + for (int i = 0; i < kIncrements; i++) { + fc.current_task_index.fetch_add(1, std::memory_order_acq_rel); + } + }; + + std::vector threads; + for (int i = 0; i < kThreads; i++) { + threads.emplace_back(incrementer); + } + for (auto &t : threads) + t.join(); + + EXPECT_EQ(fc.current_task_index.load(), kIncrements * kThreads) << "Concurrent increments should not lose updates"; +} + +// Concurrent orchestrator_done and error code write: first-writer-wins semantics +TEST_F(SharedMemoryConcurrentTest, OrchestratorDoneRace) { + constexpr int kRounds = 500; + + for (int round = 0; round < kRounds; round++) { + handle->header->orchestrator_done.store(0, std::memory_order_relaxed); + handle->header->orch_error_code.store(0, std::memory_order_relaxed); + + std::atomic winners{0}; + + auto try_set_done = [&](int32_t error_code) { + int32_t expected = 0; + if (handle->header->orchestrator_done.compare_exchange_strong( + expected, 1, std::memory_order_acq_rel, std::memory_order_acquire + )) { + handle->header->orch_error_code.store(error_code, std::memory_order_release); + winners.fetch_add(1, std::memory_order_relaxed); + } + }; + + std::thread t1(try_set_done, 100); + 
std::thread t2(try_set_done, 200); + std::thread t3(try_set_done, 300); + t1.join(); + t2.join(); + t3.join(); + + EXPECT_EQ(winners.load(), 1) << "Round " << round << ": exactly one thread should win the CAS"; + EXPECT_EQ(handle->header->orchestrator_done.load(), 1); + int32_t code = handle->header->orch_error_code.load(); + EXPECT_TRUE(code == 100 || code == 200 || code == 300) + << "Error code should be from one of the competing threads"; + } +} + +// Concurrent last_task_alive advancement: only forward movement +TEST_F(SharedMemoryConcurrentTest, LastTaskAliveMonotonic) { + constexpr int kIterations = 10000; + constexpr int kThreads = 4; + + auto &fc = handle->header->rings[0].fc; + fc.last_task_alive.store(0, std::memory_order_relaxed); + + auto advancer = [&](int id) { + for (int i = 0; i < kIterations; i++) { + // CAS-based forward-only update + int32_t desired = id * kIterations + i + 1; + int32_t current = fc.last_task_alive.load(std::memory_order_acquire); + while (current < desired) { + if (fc.last_task_alive.compare_exchange_weak( + current, desired, std::memory_order_acq_rel, std::memory_order_acquire + )) { + break; + } + } + } + }; + + std::vector threads; + for (int i = 0; i < kThreads; i++) { + threads.emplace_back(advancer, i); + } + for (auto &t : threads) + t.join(); + + int32_t final_val = fc.last_task_alive.load(); + // Should be at least the max of any thread's last write + EXPECT_GE(final_val, kIterations) << "last_task_alive should have advanced"; +} + +// Validate after concurrent modifications still reports corruption correctly +TEST_F(SharedMemoryConcurrentTest, ValidateAfterConcurrentWrites) { + constexpr int kIterations = 1000; + + // Concurrent writers update current_task_index within valid range + auto writer = [&](int ring) { + auto &fc = handle->header->rings[ring].fc; + for (int i = 0; i < kIterations; i++) { + fc.current_task_index.store(static_cast(i % 256), std::memory_order_release); + } + }; + + std::thread w0(writer, 0); + std::thread w1(writer, 1); + std::thread w2(writer, 2); + std::thread w3(writer, 3); + w0.join(); + w1.join(); + w2.join(); + w3.join(); + + EXPECT_TRUE(pto2_sm_validate(handle)) << "Valid current_task_index values should pass validation"; + + // Corrupt one ring and verify detection + handle->header->rings[2].fc.current_task_index.store(-1, std::memory_order_relaxed); + EXPECT_FALSE(pto2_sm_validate(handle)) << "Corrupted current_task_index should fail validation"; +} + +// Double destroy: pto2_sm_destroy(NULL) is safe +TEST_F(SharedMemoryConcurrentTest, DestroyNullIsSafe) { + pto2_sm_destroy(nullptr); // Should not crash +} diff --git a/tests/ut/cpp/pto2_a2a3/test_coupling.cpp b/tests/ut/cpp/pto2_a2a3/test_coupling.cpp new file mode 100644 index 000000000..40893eda0 --- /dev/null +++ b/tests/ut/cpp/pto2_a2a3/test_coupling.cpp @@ -0,0 +1,780 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Architectural coupling detection tests for TMR (tensormap_and_ringbuffer) runtime. + * + * These tests verify whether components can operate in isolation or require + * the full system to be initialized. Failures indicate tight coupling that + * makes unit testing and independent evolution difficult. + * + * Test philosophy: FAIL = coupling defect detected (expected for some tests). + */ + +#include +#include +#include +#include + +#include "pto_orchestrator.h" +#include "pto_scheduler.h" +#include "pto_tensormap.h" +#include "pto_ring_buffer.h" +#include "pto_shared_memory.h" +#include "pto_runtime2_types.h" +#include "pto_orchestration_api.h" // for make_tensor_external (Tensor ctor is private) +#include "tensor.h" + +// ============================================================================= +// Helper: Full TMR system init/destroy (measures what's needed) +// ============================================================================= + +static constexpr uint64_t TEST_HEAP_SIZE = 65536; +static constexpr int32_t TEST_WINDOW_SIZE = 64; + +struct TMRSystem { + PTO2SharedMemoryHandle *sm = nullptr; + PTO2SchedulerState sched{}; + PTO2OrchestratorState orch{}; + uint8_t *gm_heap = nullptr; + bool sm_ok = false, sched_ok = false, orch_ok = false; + + bool init(uint64_t heap_size = TEST_HEAP_SIZE, int32_t window_size = TEST_WINDOW_SIZE) { + sm = pto2_sm_create(window_size, heap_size); + if (!sm) return false; + sm_ok = true; + + gm_heap = (uint8_t *)calloc(PTO2_MAX_RING_DEPTH, heap_size); + if (!gm_heap) return false; + + if (!pto2_scheduler_init(&sched, sm->header)) return false; + sched_ok = true; + + if (!pto2_orchestrator_init(&orch, sm->header, gm_heap, heap_size, 256)) return false; + orch_ok = true; + + pto2_orchestrator_set_scheduler(&orch, &sched); + return true; + } + + void destroy() { + if (orch_ok) pto2_orchestrator_destroy(&orch); + if (sched_ok) pto2_scheduler_destroy(&sched); + if (gm_heap) { + free(gm_heap); + gm_heap = nullptr; + } + if (sm_ok) pto2_sm_destroy(sm); + } +}; + +// Helper: create a minimal Tensor for TensorMap operations. +// Tensor's default constructor is private; route through make_tensor_external. +// The `addr` argument is reinterpreted as a fake pointer -- the TensorMap only +// hashes the address and compares shapes, it never dereferences the buffer. +static Tensor make_test_tensor(uint64_t addr, uint32_t ndims = 1, uint32_t shape0 = 100) { + uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS] = {}; + shapes[0] = shape0; + for (uint32_t i = 1; i < ndims; i++) + shapes[i] = 1; + return make_tensor_external( + reinterpret_cast(addr), shapes, ndims, DataType::FLOAT32, /*manual_dep=*/false, /*version=*/0 + ); +} + +// ============================================================================= +// Suite 1: ComponentIsolation +// ============================================================================= + +TEST(ComponentIsolation, TensorMapWithoutOrchPointer) { + // TensorMap has an `orch` pointer field (set by orchestrator_init). + // Can we use TensorMap for insert + lookup without setting it? 
+ int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {16, 16, 16, 16}; + PTO2TensorMap tmap{}; + ASSERT_TRUE(tmap.init(256, 1024, window_sizes)); + + // orch pointer is never set -- TensorMap is used standalone + + // Insert should work + Tensor t = make_test_tensor(0x1000); + PTO2TaskId tid = PTO2TaskId::make(0, 0); + tmap.insert(t, tid); + + // Lookup should work + PTO2LookupResult result; + tmap.lookup(t, result); + EXPECT_GE( + result.count, 1 + ) << "TensorMap lookup works without orch pointer -- orch is unused for core insert/lookup operations"; + + tmap.destroy(); +} + +TEST(ComponentIsolation, TensorMapWithZeroWindowSizes) { + // Passing zero window sizes to TensorMap::init() should be rejected, + // but there's no validation. + int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {0, 0, 0, 0}; + PTO2TensorMap tmap{}; + // init calls malloc(0 * sizeof(ptr)) for task_entry_heads -- implementation-defined + bool ok = tmap.init(256, 1024, window_sizes); + + if (ok) { + // If init succeeded, inserting should be unsafe because + // mask = (0 - 1) = 0xFFFFFFFF -- slot index would be OOB. + // This proves lack of input validation. + // We can't safely test insert, just document the gap. + SUCCEED() << "Zero window_size accepted without validation: " + "insert would compute OOB slot index"; + tmap.destroy(); + } else { + // malloc(0) returned NULL on this platform + SUCCEED() << "init correctly failed with zero window_size (malloc(0) returned NULL)"; + } +} + +TEST(ComponentIsolation, DepPoolReclaimNeedsScheduler) { + // DepListPool::reclaim() takes PTO2SchedulerState& and accesses + // sched.ring_sched_states[ring_id].get_slot_state_by_task_id(sm_last_task_alive - 1) + // This couples DepPool to Scheduler internals. + PTO2DepListEntry entries[64]; + memset(entries, 0, sizeof(entries)); + std::atomic error_code{0}; + PTO2DepListPool pool; + pool.init(entries, 64, &error_code); + + // Allocate some entries to make top > 0 + for (int i = 0; i < 10; i++) { + pool.alloc(); + } + + // To call reclaim, we need a PTO2SharedMemoryRingHeader. + // Create a minimal SM to get a valid ring header. + PTO2SharedMemoryHandle *sm = pto2_sm_create(TEST_WINDOW_SIZE, TEST_HEAP_SIZE); + ASSERT_NE(sm, nullptr); + + // reclaim with sm_last_task_alive=0 should be a no-op (guard: sm_last_task_alive > 0) + pool.reclaim(sm->header->rings[0], 0); + SUCCEED() << "reclaim with last_task_alive=0 is a no-op"; + + // reclaim with sm_last_task_alive=PTO2_DEP_POOL_CLEANUP_INTERVAL would access + // sched.ring_sched_states[0].slot_states[...] 
which is nullptr + // This demonstrates the coupling: DepPool cannot reclaim without valid Scheduler state + // We can't safely call reclaim(sched, 0, 64) because it would dereference nullptr + + // Document the coupling via signature inspection + SUCCEED() << "DepPool::reclaim() requires PTO2SharedMemoryRingHeader& -- " + "cannot reclaim without valid shared memory ring header"; + + pto2_sm_destroy(sm); +} + +TEST(ComponentIsolation, DepPoolEnsureSpaceSignatureCoupling) { + // ensure_space() requires BOTH PTO2SchedulerState& AND PTO2RingFlowControl& + // This couples DepPool to Scheduler + SharedMemory simultaneously + PTO2DepListEntry entries[256]; + memset(entries, 0, sizeof(entries)); + std::atomic error_code{0}; + PTO2DepListPool pool; + pool.init(entries, 256, &error_code); + + // With enough space, ensure_space returns immediately without accessing ring header + PTO2SharedMemoryHandle *sm = pto2_sm_create(TEST_WINDOW_SIZE, TEST_HEAP_SIZE); + ASSERT_NE(sm, nullptr); + + pool.ensure_space(sm->header->rings[0], 5); // available() = 255 >= 5 -- no-op + EXPECT_GE(pool.available(), 5) << "ensure_space returns immediately when space sufficient, " + "but signature still requires PTO2SharedMemoryRingHeader reference"; + + pto2_sm_destroy(sm); +} + +TEST(ComponentIsolation, SchedulerConsumedPathAccessesSM) { + // check_and_handle_consumed -> advance_ring_pointers requires valid SM header. + // Build a minimal slot that would trigger the consumed path. + TMRSystem sys; + ASSERT_TRUE(sys.init()); + + auto &rs = sys.sched.ring_sched_states[0]; + PTO2TaskSlotState &slot = sys.sm->header->rings[0].get_slot_state_by_slot(0); + + // Set up a task that appears consumed + slot.fanout_count = 1; + slot.fanout_refcount.store(1, std::memory_order_relaxed); + slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed); + slot.ring_id = 0; + + // Provide a valid task descriptor so advance_ring_pointers won't crash + PTO2TaskDescriptor dummy_desc{}; + dummy_desc.packed_buffer_base = nullptr; + dummy_desc.packed_buffer_end = nullptr; + slot.task = &dummy_desc; + + // Set current_task_index to 1 so advance_ring_pointers scans slot 0 + sys.sm->header->rings[0].fc.current_task_index.store(1, std::memory_order_relaxed); + + // This should work with valid SM, proving SM is required + sys.sched.check_and_handle_consumed(slot); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED) + << "check_and_handle_consumed works only with valid SM handle -- " + "Scheduler->SharedMemory tight coupling confirmed"; + + sys.destroy(); +} + +TEST(ComponentIsolation, OrchestratorInitWithoutSM) { + // pto2_orchestrator_init dereferences sm_header->rings[r].fc immediately. + // Passing nullptr should crash (no null-check). + PTO2OrchestratorState orch{}; + uint8_t heap[1024]; + + EXPECT_DEATH(pto2_orchestrator_init(&orch, nullptr, heap, 1024), ".*") + << "Orchestrator init does not validate sm_header != nullptr"; +} + +TEST(ComponentIsolation, TaskSlotStateStandalone) { + // TaskSlotState should be the one type that can be operated independently. + // Manually drive the full state machine. 
+ alignas(64) PTO2TaskSlotState slot{}; + slot.fanin_count = 2; + slot.fanout_count = 1; + slot.fanin_refcount.store(0, std::memory_order_relaxed); + slot.fanout_refcount.store(0, std::memory_order_relaxed); + slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); + + // PENDING -> READY: fanin_refcount reaches fanin_count + slot.fanin_refcount.fetch_add(1, std::memory_order_relaxed); + slot.fanin_refcount.fetch_add(1, std::memory_order_relaxed); + EXPECT_EQ(slot.fanin_refcount.load(), slot.fanin_count); + + PTO2TaskState expected_pending = PTO2_TASK_PENDING; + EXPECT_TRUE(slot.task_state.compare_exchange_strong(expected_pending, PTO2_TASK_READY)); + + // READY -> RUNNING + PTO2TaskState expected_ready = PTO2_TASK_READY; + EXPECT_TRUE(slot.task_state.compare_exchange_strong(expected_ready, PTO2_TASK_RUNNING)); + + // RUNNING -> COMPLETED + slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed); + + // COMPLETED -> CONSUMED: fanout_refcount reaches fanout_count + slot.fanout_refcount.fetch_add(1, std::memory_order_relaxed); + EXPECT_EQ(slot.fanout_refcount.load(), slot.fanout_count); + + PTO2TaskState expected_completed = PTO2_TASK_COMPLETED; + EXPECT_TRUE(slot.task_state.compare_exchange_strong(expected_completed, PTO2_TASK_CONSUMED)) + << "TaskSlotState can be fully driven standalone -- good isolation"; +} + +TEST(ComponentIsolation, HeapRingWithLocalAtomics) { + // The standalone PTO2HeapRing/pto2_heap_ring_init API has been consolidated + // into PTO2TaskAllocator, which couples the heap and the task ring. There is + // no longer a way to exercise heap allocation in isolation with just local + // atomics -- you need a fully initialized allocator backed by SM pointers. + // + // This test is preserved as a documentation of the tightening of that + // coupling: heap alloc can no longer run independently of the task ring. + SUCCEED() << "PTO2HeapRing/pto2_heap_ring_init removed -- heap allocation is " + "now embedded in PTO2TaskAllocator, which requires a task ring " + "and SM-backed atomics. Heap allocation is no longer isolable."; +} + +// ============================================================================= +// Suite 2: InitializationOrder +// ============================================================================= + +TEST(InitializationOrder, TensorMapInitWithGarbageWindowSizes) { + // If SM header is not initialized before TensorMap::init_default(), + // garbage window_sizes are read. Simulate this with large values. + int32_t garbage_sizes[PTO2_MAX_RING_DEPTH] = {-1, -1, -1, -1}; + PTO2TensorMap tmap{}; + + // malloc(-1 * sizeof(ptr)) = malloc(huge) -- should fail + bool ok = tmap.init(256, 1024, garbage_sizes); + EXPECT_FALSE(ok) << "TensorMap::init with negative window_sizes should fail on malloc, " + "but no explicit validation rejects negative values before malloc"; + + if (ok) tmap.destroy(); +} + +TEST(InitializationOrder, SchedulerInitWithZeroWindowSize) { + // If SM has task_window_size=0, scheduler creates arrays of size 0. 
+ PTO2SharedMemoryHandle *sm = pto2_sm_create(0, TEST_HEAP_SIZE); + + if (sm == nullptr) { + // pto2_sm_create rejects 0 window -- good validation + SUCCEED() << "pto2_sm_create rejects window_size=0"; + return; + } + + PTO2SchedulerState sched{}; + uint8_t heap[TEST_HEAP_SIZE * PTO2_MAX_RING_DEPTH]{}; + (void)heap; + + bool ok = pto2_scheduler_init(&sched, sm->header); + if (ok) { + // task_window_mask = 0 - 1 = -1 (wraps to max uint) + // get_slot_state_by_task_id(0) would access slot_states[0 & (-1)] = slot_states[0] + // But slot_states was allocated with new PTO2TaskSlotState[0] -- zero-length! + EXPECT_EQ(sm->header->rings[0].task_window_size, 0u) + << "Zero window_size accepted: slot_states[0] is zero-length allocation, " + "any access is UB"; + pto2_scheduler_destroy(&sched); + } + + pto2_sm_destroy(sm); +} + +TEST(InitializationOrder, OrchestratorDoubleInit) { + // Calling init twice without destroy leaks all first-init allocations. + TMRSystem sys; + ASSERT_TRUE(sys.init()); + + // Re-init without destroy -- old allocations are leaked + uint8_t extra_heap[TEST_HEAP_SIZE * PTO2_MAX_RING_DEPTH]{}; + bool ok = pto2_orchestrator_init(&sys.orch, sys.sm->header, extra_heap, TEST_HEAP_SIZE, 256); + EXPECT_TRUE(ok) << "Double init succeeds -- no guard against re-initialization. " + "First init's allocations are leaked"; + + // Clean up the second init + pto2_orchestrator_destroy(&sys.orch); + + // First init's memory is leaked -- we can't free it anymore + // This is a documentation test: no re-init guard exists + sys.orch_ok = false; // prevent double destroy + sys.destroy(); +} + +TEST(InitializationOrder, OrchestratorBeforeScheduler) { + // Init orchestrator without setting scheduler. scope_begin + scope_end should + // degrade gracefully (skip dependency tracking). + PTO2SharedMemoryHandle *sm = pto2_sm_create(TEST_WINDOW_SIZE, TEST_HEAP_SIZE); + ASSERT_NE(sm, nullptr); + + uint8_t *heap = (uint8_t *)calloc(PTO2_MAX_RING_DEPTH, TEST_HEAP_SIZE); + ASSERT_NE(heap, nullptr); + + PTO2OrchestratorState orch{}; + ASSERT_TRUE(pto2_orchestrator_init(&orch, sm->header, heap, TEST_HEAP_SIZE, 256)); + + // scheduler is not set -- scope_begin/scope_end should not crash + pto2_scope_begin(&orch); + pto2_scope_end(&orch); + SUCCEED() << "scope_begin + scope_end work without scheduler (no crash). " + "Tasks submitted in this scope have no dependency tracking."; + + pto2_orchestrator_destroy(&orch); + free(heap); + pto2_sm_destroy(sm); +} + +// ============================================================================= +// Suite 3: CrossComponentContract +// ============================================================================= + +TEST(CrossComponentContract, WindowSizeMismatch) { + // After the PTO2SharedMemoryRingHeader consolidation (#622), both scheduler + // and orchestrator read window_size from the same SM ring header pointer. + // Verify via the SM header: the single source of truth. 
+ PTO2SharedMemoryHandle *sm = pto2_sm_create(TEST_WINDOW_SIZE, TEST_HEAP_SIZE); + ASSERT_NE(sm, nullptr); + + uint8_t *heap = (uint8_t *)calloc(PTO2_MAX_RING_DEPTH, TEST_HEAP_SIZE); + ASSERT_NE(heap, nullptr); + + // Initialize scheduler and orchestrator + PTO2SchedulerState sched{}; + ASSERT_TRUE(pto2_scheduler_init(&sched, sm->header)); + + PTO2OrchestratorState orch{}; + ASSERT_TRUE(pto2_orchestrator_init(&orch, sm->header, heap, TEST_HEAP_SIZE, 256)); + + // Both read from the same SM header -- verify the header value is correct + EXPECT_EQ(sm->header->rings[0].task_window_size, (uint64_t)TEST_WINDOW_SIZE) + << "SM ring header holds the authoritative window_size"; + + // Mutate SM header -- both components see the new value because they + // share the same ring header pointer + sm->header->rings[0].task_window_size = TEST_WINDOW_SIZE * 2; + EXPECT_EQ(sm->header->rings[0].task_window_size, (uint64_t)(TEST_WINDOW_SIZE * 2)) + << "After RingHeader consolidation, mutation is visible to all components " + "through the shared ring header pointer -- independent-caching mismatch eliminated"; + + pto2_orchestrator_destroy(&orch); + pto2_scheduler_destroy(&sched); + free(heap); + pto2_sm_destroy(sm); +} + +TEST(CrossComponentContract, FanoutCountManipulation) { + // fanout_count is set by orchestrator (+1 for scope), checked by scheduler. + // If we bypass the +1 initialization, check_and_handle_consumed fires immediately. + TMRSystem sys; + ASSERT_TRUE(sys.init()); + + auto &rs = sys.sched.ring_sched_states[0]; + PTO2TaskSlotState &slot = sys.sm->header->rings[0].get_slot_state_by_slot(0); + + PTO2TaskDescriptor dummy_desc{}; + dummy_desc.packed_buffer_base = nullptr; + dummy_desc.packed_buffer_end = nullptr; + slot.task = &dummy_desc; + slot.ring_id = 0; + + // Normal init: orchestrator sets fanout_count = 1 (scope ref) + // Here we bypass: set fanout_count = 0 directly + slot.fanout_count = 0; + slot.fanout_refcount.store(0, std::memory_order_relaxed); + slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed); + + sys.sm->header->rings[0].fc.current_task_index.store(1, std::memory_order_relaxed); + + // check_and_handle_consumed: fanout_refcount(0) == fanout_count(0) -> true -> CONSUMED + sys.sched.check_and_handle_consumed(slot); + + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED) + << "fanout_count=0 causes premature CONSUMED transition -- " + "scheduler trusts orchestrator's fanout_count without validation"; +} + +TEST(CrossComponentContract, HeapTailBeyondTop) { + // Previously tested PTO2HeapRing::pto2_heap_ring_try_alloc with manually + // constructed top/tail atomics. PTO2HeapRing no longer exists as a + // free-standing component -- heap state (top/tail) is now encapsulated in + // PTO2TaskAllocator as local integers derived from task descriptors, not + // from externally writable atomics. An invalid tail>top state cannot be + // synthesized without a full allocator + scheduler setup, so this + // coupling-contract scenario is no longer reachable from a unit test. + SUCCEED() << "PTO2HeapRing removed; heap tail/top are now internal to " + "PTO2TaskAllocator and derived from consumed task descriptors. " + "No external atomic to corrupt -- this specific invariant is " + "enforced by construction rather than by validation."; +} + +TEST(CrossComponentContract, ActiveMaskZero) { + // active_mask=0 should never happen (orchestrator has always_assert). + // But scheduler's release_fanin_and_check_ready has no such guard. 
+ alignas(64) PTO2TaskSlotState slot{}; + slot.active_mask = 0; // Invalid -- no subtask active + slot.fanin_count = 1; + slot.fanin_refcount.store(0, std::memory_order_relaxed); + slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); + + PTO2ResourceShape shape = pto2_active_mask_to_shape(0); + // With mask=0: core_mask=0, popcount=0, no AIC bit -> falls through to AIV. + // The enum has been simplified to {AIC, AIV, MIX}; there is no longer a + // distinct AIV_X2 shape (multi-AIV tasks are all MIX). + EXPECT_EQ(static_cast(shape), static_cast(PTO2ResourceShape::AIV)) + << "active_mask=0 maps to AIV -- incorrect shape routing. " + "Orchestrator guards with always_assert, but scheduler does not validate"; +} + +TEST(CrossComponentContract, TaskDescriptorNullInConsumedSlot) { + // Historically advance_ring_pointers dereferenced slot.task->packed_buffer_end + // to drive heap reclamation from the last consumed task. Heap reclamation + // has since moved into PTO2TaskAllocator::update_heap_tail (reached by the + // orchestrator on allocation), so advance_ring_pointers no longer touches + // slot.task at all -- it only walks task_state. The coupling this test was + // designed to surface has been removed by construction. + TMRSystem sys; + ASSERT_TRUE(sys.init()); + + auto &rs = sys.sched.ring_sched_states[0]; + PTO2TaskSlotState &slot = sys.sm->header->rings[0].get_slot_state_by_slot(0); + + // Mark as CONSUMED but leave task pointer as nullptr + slot.task_state.store(PTO2_TASK_CONSUMED, std::memory_order_relaxed); + slot.task = nullptr; // Not initialized + slot.ring_id = 0; + + sys.sm->header->rings[0].fc.current_task_index.store(1, std::memory_order_relaxed); + + // Should no longer crash: advance_ring_pointers now only reads task_state. + rs.advance_ring_pointers(); + EXPECT_EQ(rs.last_task_alive, 1) << "advance_ring_pointers no longer dereferences slot.task -- " + "scheduler/orchestrator heap-reclamation coupling removed"; + + sys.destroy(); +} + +// ============================================================================= +// Suite 4: StateLeakage +// ============================================================================= + +TEST(StateLeakage, HeapErrorCodeInvisibleToScheduler) { + // Orchestrator sets orch_error_code on fatal error. + // Scheduler's hot path does NOT check this error code. + TMRSystem sys; + ASSERT_TRUE(sys.init()); + + // Simulate orchestrator setting fatal error + sys.sm->header->orch_error_code.store(PTO2_ERROR_HEAP_RING_DEADLOCK, std::memory_order_release); + + // Scheduler operations continue despite error: + // push to ready queue + auto &rs = sys.sched.ring_sched_states[0]; + PTO2TaskSlotState &slot = sys.sm->header->rings[0].get_slot_state_by_slot(0); + slot.active_mask = PTO2_SUBTASK_MASK_AIV0; + PTO2ResourceShape shape = pto2_active_mask_to_shape(slot.active_mask); + + bool pushed = sys.sched.ready_queues[static_cast(shape)].push(&slot); + EXPECT_TRUE(pushed); + + // pop from ready queue + PTO2TaskSlotState *popped = sys.sched.ready_queues[static_cast(shape)].pop(); + EXPECT_EQ(popped, &slot) << "Scheduler continues normal operation after orchestrator fatal error -- " + "orch_error_code is one-directional (orch->host), invisible to scheduler hot path"; + + sys.destroy(); +} + +TEST(StateLeakage, HeadOfLineBlocking) { + // advance_ring_pointers scans linearly: stops at first non-CONSUMED slot. + // One incomplete task blocks reclamation of all subsequent CONSUMED tasks. 
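+    // Ring contents built below, and where the linear scan stops:
+    //   slot 0: CONSUMED   -- reclaimable
+    //   slot 1: COMPLETED  -- scan stops here; last_task_alive ends at 1
+    //   slot 2: CONSUMED   -- reclaimable, but stuck behind slot 1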
+ TMRSystem sys; + ASSERT_TRUE(sys.init()); + + auto &rs = sys.sched.ring_sched_states[0]; + PTO2TaskDescriptor descs[3]{}; + descs[0].packed_buffer_end = nullptr; + descs[1].packed_buffer_end = nullptr; + descs[2].packed_buffer_end = nullptr; + + // Task 0: CONSUMED + PTO2TaskSlotState &slot0 = sys.sm->header->rings[0].get_slot_state_by_slot(0); + slot0.task_state.store(PTO2_TASK_CONSUMED, std::memory_order_relaxed); + slot0.task = &descs[0]; + + // Task 1: COMPLETED (NOT consumed -- fanout incomplete) + PTO2TaskSlotState &slot1 = sys.sm->header->rings[0].get_slot_state_by_slot(1); + slot1.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed); + slot1.task = &descs[1]; + + // Task 2: CONSUMED + PTO2TaskSlotState &slot2 = sys.sm->header->rings[0].get_slot_state_by_slot(2); + slot2.task_state.store(PTO2_TASK_CONSUMED, std::memory_order_relaxed); + slot2.task = &descs[2]; + + sys.sm->header->rings[0].fc.current_task_index.store(3, std::memory_order_relaxed); + + rs.advance_ring_pointers(); + + // last_task_alive should stop at task 1 (COMPLETED, not CONSUMED) + EXPECT_EQ(rs.last_task_alive, 1) << "Head-of-line blocking: task 1 (COMPLETED) blocks reclamation of " + "task 2 (CONSUMED). Linear scan design couples reclamation rate " + "to the slowest consumer in the ring."; + + sys.destroy(); +} + +TEST(StateLeakage, TensorMapCleanupInterval) { + // TensorMap cleanup is triggered every PTO2_TENSORMAP_CLEANUP_INTERVAL tasks. + // Between cleanups, stale entries accumulate in bucket chains, degrading lookup. + int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {256, 256, 256, 256}; + PTO2TensorMap tmap{}; + ASSERT_TRUE(tmap.init(256, 4096, window_sizes)); + + // Insert entries for tasks 0..99 (all same address = same bucket) + for (int i = 0; i < 100; i++) { + Tensor t = make_test_tensor(0x2000); + PTO2TaskId tid = PTO2TaskId::make(0, i); + tmap.insert(t, tid); + } + + // Advance last_task_alive to 80 -- tasks 0..79 are stale + tmap.sync_validity(0, 80); + + // Lookup must traverse all 100 entries (80 stale + 20 valid) + // because cleanup hasn't been triggered yet (need sync_tensormap, not just sync_validity) + PTO2LookupResult result; + Tensor query = make_test_tensor(0x2000); + tmap.lookup(query, result); + + // Should find entries from tasks 80..99 = 20 valid + EXPECT_EQ(result.count, 16) << "Lookup result capped at PTO2_LOOKUP_MAX_RESULTS=16, but stale entries " + "still slow traversal. Cleanup interval (" + << PTO2_TENSORMAP_CLEANUP_INTERVAL + << " tasks) couples TensorMap performance to scheduler's CONSUMED advancement rate"; + + tmap.destroy(); +} + +TEST(StateLeakage, SubtaskMaskProtocol) { + // active_mask bits (AIC=0x1, AIV0=0x2, AIV1=0x4) are set by orchestrator + // and checked by scheduler's on_subtask_complete. There's no shared enum + // enforcing consistency -- just implicit agreement on bit positions. + + // Orchestrator normalizes aiv1-only to aiv0: + // If only aiv1 set (0x4), it moves to aiv0 (0x2). + // Scheduler uses SubtaskSlot enum (AIC=0, AIV0=1, AIV1=2) for done_bit. 
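+    //
+    // The implicit contract, side by side (values taken from the comments above;
+    // no shared header ties the two columns together):
+    //   orchestrator mask bit             scheduler slot              done_bit = 1u << slot
+    //   PTO2_SUBTASK_MASK_AIC  = 0x1      PTO2SubtaskSlot::AIC  = 0   0x1
+    //   PTO2_SUBTASK_MASK_AIV0 = 0x2      PTO2SubtaskSlot::AIV0 = 1   0x2
+    //   PTO2_SUBTASK_MASK_AIV1 = 0x4      PTO2SubtaskSlot::AIV1 = 2   0x4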
+ + // Verify the normalization creates an implicit contract: + uint8_t mask_aiv1_only = PTO2_SUBTASK_MASK_AIV1; // 0x4 + // After orchestrator normalization: becomes PTO2_SUBTASK_MASK_AIV0 = 0x2 + uint8_t normalized = PTO2_SUBTASK_MASK_AIV0; // aiv1 moved to aiv0 + + // Scheduler completion path: on_subtask_complete with AIV0 slot sets bit 1 + uint8_t done_bit = (1u << static_cast(PTO2SubtaskSlot::AIV0)); + EXPECT_EQ(done_bit, PTO2_SUBTASK_MASK_AIV0); + + // But if scheduler receives completion for AIV1 slot (the physical source), + // it would set bit 2, which doesn't match normalized mask 0x2 + uint8_t wrong_done_bit = (1u << static_cast(PTO2SubtaskSlot::AIV1)); + EXPECT_NE(wrong_done_bit, normalized) + << "Subtask mask protocol: orchestrator normalizes aiv1->aiv0 (mask 0x4->0x2), " + "but scheduler must dispatch to AIV0 slot (not AIV1). " + "If scheduler signals AIV1 completion, done_mask (0x4) != active_mask (0x2) -- " + "task never completes. No compile-time enforcement exists."; +} + +// ============================================================================= +// Suite 5: CompileTimeCoupling +// ============================================================================= + +TEST(CompileTimeCoupling, OrchestratorInitDestroyCycle) { + // Orchestrator embeds rings, TensorMap, scope stack -- a large composite. + // Verify it can be initialized and destroyed cleanly multiple times, + // proving all sub-components are properly managed. + for (int cycle = 0; cycle < 3; cycle++) { + TMRSystem sys; + ASSERT_TRUE(sys.init()) << "Init cycle " << cycle; + sys.destroy(); + } + SUCCEED() << "OrchestratorState init/destroy is clean across multiple cycles"; +} + +TEST(CompileTimeCoupling, MaxRingDepthPropagation) { + // PTO2_MAX_RING_DEPTH=4 is used across multiple components. + // Verify that the system initializes and operates correctly for all rings + // up to PTO2_MAX_RING_DEPTH, without probing internal array sizes. + + // static_asserts on array sizes at the struct level are compile-time safety + // nets that belong in production headers, not in behavioral tests. + // This test verifies the functional consequence: all ring indices work. + PTO2SharedMemoryHandle *sm = pto2_sm_create(TEST_WINDOW_SIZE, TEST_HEAP_SIZE); + ASSERT_NE(sm, nullptr); + + // Verify all rings are accessible through SM header + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + EXPECT_EQ(sm->header->rings[r].task_window_size, (uint64_t)TEST_WINDOW_SIZE) + << "Ring " << r << " should be initialized with correct window_size"; + } + + // TensorMap should accept inserts and lookups on all rings + int32_t window_sizes[PTO2_MAX_RING_DEPTH]; + for (int i = 0; i < PTO2_MAX_RING_DEPTH; i++) + window_sizes[i] = TEST_WINDOW_SIZE; + PTO2TensorMap tmap{}; + ASSERT_TRUE(tmap.init(256, 1024, window_sizes)); + + Tensor t = make_test_tensor(0x1000); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + tmap.insert(t, PTO2TaskId::make(r, 0)); + } + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(t, result); + EXPECT_EQ(result.count, PTO2_MAX_RING_DEPTH) + << "TensorMap supports inserts on all " << PTO2_MAX_RING_DEPTH << " rings"; + + tmap.destroy(); + pto2_sm_destroy(sm); +} + +TEST(CompileTimeCoupling, WindowSizeConsistencyAfterInit) { + // Verify that after full system init, all components operate correctly + // with the configured window_size by exercising the public API. 
+ TMRSystem sys; + ASSERT_TRUE(sys.init()); + + // The authoritative window_size lives in the SM ring header + uint64_t expected_window = sys.sm->header->rings[0].task_window_size; + EXPECT_EQ(expected_window, (uint64_t)TEST_WINDOW_SIZE); + + // Verify functional consistency: insert tasks up to window_size + // and confirm TensorMap, Orchestrator, and Scheduler all work correctly. + Tensor t = make_test_tensor(0x1000); + pto2_scope_begin(&sys.orch); + + // Insert a tensor -- exercises Orchestrator + TensorMap + sys.orch.tensor_map.insert(t, PTO2TaskId::make(0, 0)); + + // Lookup -- exercises TensorMap with its window_size + PTO2LookupResult result; + result.count = 0; + sys.orch.tensor_map.lookup(t, result); + EXPECT_EQ(result.count, 1) << "TensorMap insert+lookup works with configured window_size"; + + pto2_scope_end(&sys.orch); + + sys.destroy(); +} + +TEST(CompileTimeCoupling, TaskSlotStateLifecycleStandalone) { + // Verify TaskSlotState can be fully driven through its state machine + // without any other component -- proving it is the nexus type that + // both orchestrator and scheduler operate on. + alignas(64) PTO2TaskSlotState slot{}; + slot.fanin_count = 2; + slot.fanout_count = 1; + slot.fanin_refcount.store(0, std::memory_order_relaxed); + slot.fanout_refcount.store(0, std::memory_order_relaxed); + slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); + + // Drive full lifecycle: PENDING -> READY -> RUNNING -> COMPLETED -> CONSUMED + slot.fanin_refcount.fetch_add(1); + slot.fanin_refcount.fetch_add(1); + EXPECT_EQ(slot.fanin_refcount.load(), slot.fanin_count); + + PTO2TaskState expected = PTO2_TASK_PENDING; + EXPECT_TRUE(slot.task_state.compare_exchange_strong(expected, PTO2_TASK_READY)); + + expected = PTO2_TASK_READY; + EXPECT_TRUE(slot.task_state.compare_exchange_strong(expected, PTO2_TASK_RUNNING)); + + slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed); + + slot.fanout_refcount.fetch_add(1); + EXPECT_EQ(slot.fanout_refcount.load(), slot.fanout_count); + + expected = PTO2_TASK_COMPLETED; + EXPECT_TRUE(slot.task_state.compare_exchange_strong(expected, PTO2_TASK_CONSUMED)) + << "TaskSlotState can be fully driven standalone -- references types from " + "both orchestrator and scheduler domains but is independently operable"; +} + +TEST(CompileTimeCoupling, ReadyQueueAllShapesUsable) { + // PTO2_NUM_RESOURCE_SHAPES ready queues exist (one per shape). + // Verify all can be initialized and used for push/pop. + for (int s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++) { + PTO2ReadyQueue queue{}; + ASSERT_TRUE(pto2_ready_queue_init(&queue, 16)) << "Shape " << s << " queue init failed"; + + PTO2TaskSlotState item{}; + EXPECT_TRUE(queue.push(&item)); + EXPECT_EQ(queue.pop(), &item); + + pto2_ready_queue_destroy(&queue); + } +} + +TEST(CompileTimeCoupling, LinkDependencyChain) { + // This test file links 5 runtime .cpp files: + // pto_orchestrator.cpp, pto_tensormap.cpp, pto_shared_memory.cpp, + // pto_ring_buffer.cpp, pto_scheduler.cpp + // This is because pto_tensormap.cpp includes pto_orchestrator.h (circular), + // which includes pto_scheduler.h, pto_ring_buffer.h, pto_shared_memory.h. + // Cannot compile TensorMap without linking the full runtime. + SUCCEED() << "test_coupling links 5 runtime .cpp files. " + "Root cause: pto_tensormap.cpp #includes pto_orchestrator.h " + "for sync_tensormap, creating a circular compile-unit dependency. 
" + "This forces all tests that include TensorMap to also link " + "Orchestrator, Scheduler, RingBuffer, and SharedMemory."; +} diff --git a/tests/ut/cpp/pto2_a2a3/test_coupling_stub.cpp b/tests/ut/cpp/pto2_a2a3/test_coupling_stub.cpp new file mode 100644 index 000000000..022f4da2b --- /dev/null +++ b/tests/ut/cpp/pto2_a2a3/test_coupling_stub.cpp @@ -0,0 +1,727 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Stub-based architectural coupling detection tests. + * + * This file deliberately excludes pto_orchestrator.cpp from the link. + * If it compiles and links successfully, that PROVES TensorMap + Scheduler + + * RingBuffer + SharedMemory can be used without the Orchestrator at link time. + * + * Key distinction probed here: + * Link-time coupling -- .o file has UND symbols pointing to another component + * Compile-time coupling -- .cpp includes another component's header (type access) + * Type-level coupling -- function signature uses another component's struct type, + * forcing full include even if only a pointer is stored + * + * Test philosophy: document coupling depth precisely using stubs. + * FAIL = a coupling contract that the src violates or makes harder than necessary. + */ + +#include +#include +#include +#include +#include + +#include "pto_ring_buffer.h" +#include "pto_scheduler.h" +#include "pto_shared_memory.h" +#include "pto_tensormap.h" +#include "pto_runtime2_types.h" +#include "tensor.h" +// Only for make_tensor_external (inline, no link dependency on orchestrator.cpp). +#include "pto_orchestration_api.h" + +// ============================================================================= +// Shared helpers +// ============================================================================= + +static constexpr uint64_t SH = 65536; // heap size for sm_create +static constexpr int32_t SW = 64; // task window size + +// Minimal stub: allocate only the fields reclaim() reads. +// Fields task_window_size/mask/slot_states now live on PTO2SharedMemoryRingHeader, +// so we build a fake ring header on the heap. +struct MinimalSchedStub { + PTO2SharedMemoryRingHeader ring_header{}; + PTO2TaskSlotState *slot_array = nullptr; + static constexpr int32_t WINDOW = 64; + + bool init(uint8_t /*ring_id*/ = 0) { + memset(&ring_header, 0, sizeof(ring_header)); + slot_array = new (std::nothrow) PTO2TaskSlotState[WINDOW]{}; + if (!slot_array) return false; + ring_header.slot_states = slot_array; + ring_header.task_window_size = WINDOW; + ring_header.task_window_mask = WINDOW - 1; + return true; + } + + void destroy() { + delete[] slot_array; + slot_array = nullptr; + } +}; + +// Minimal pool helper: 512-entry DepListPool. 
+struct SmallPool { + PTO2DepListEntry entries[512]; + std::atomic error_code{0}; + PTO2DepListPool pool; + + void init() { + memset(entries, 0, sizeof(entries)); + pool.init(entries, 512, &error_code); + } + int alloc_n(int n) { + int last = 0; + for (int i = 0; i < n; i++) { + auto *e = pool.alloc(); + if (e) last = i + 1; + } + return last; + } +}; + +static Tensor make_tensor(uint64_t addr, uint32_t ndims = 1, uint32_t shape0 = 100) { + // Use make_tensor_external (inline header helper) since Tensor default + // constructor is private. The helper does not create any link-time + // dependency on pto_orchestrator.cpp. + uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS] = {}; + shapes[0] = shape0; + for (uint32_t i = 1; i < ndims; ++i) + shapes[i] = 1; + return make_tensor_external( + reinterpret_cast(static_cast(addr)), shapes, ndims, DataType::FLOAT32, /*manual_dep=*/false, + /*version=*/0 + ); +} + +// ============================================================================= +// Suite 1: DepPoolStubIsolation +// ============================================================================= + +// sm_last_task_alive < PTO2_DEP_POOL_CLEANUP_INTERVAL: reclaim is a no-op. +// A zero-initialized PTO2SharedMemoryRingHeader (slot_states=nullptr) must not crash. +TEST(DepPoolStubIsolation, ReclaimBelowInterval_NeverAccessesScheduler) { + SmallPool sp; + sp.init(); + sp.alloc_n(100); + + // Capture used count BEFORE reclaim to compare after + int32_t used_before = sp.pool.used(); + + // Zero-init stub -- slot_states is nullptr + PTO2SharedMemoryRingHeader ring_hdr{}; + memset(&ring_hdr, 0, sizeof(ring_hdr)); + + // sm_last_task_alive = interval - 1 -> guard `>= interval` is false -> no-op + int32_t below = PTO2_DEP_POOL_CLEANUP_INTERVAL - 1; + sp.pool.reclaim(ring_hdr, below); + + // Pool unchanged -- reclaim was a no-op + EXPECT_EQ(sp.pool.used(), used_before) + << "reclaim() is a no-op when sm_last_task_alive < interval. " + "A fully zero-initialized (nullptr slot_states) PTO2SharedMemoryRingHeader " + "is safe to pass -- the struct is never touched."; +} + +// sm_last_task_alive == PTO2_DEP_POOL_CLEANUP_INTERVAL: reclaim reads exactly +// ring_header.slot_states[(interval-1) & mask].dep_pool_mark +// Stub provides only those three values; all other fields remain zero. +TEST(DepPoolStubIsolation, ReclaimAtInterval_OnlyNeedsSlotArrayAndMask) { + SmallPool sp; + sp.init(); + sp.alloc_n(100); // top = 100, tail = 0 + + MinimalSchedStub stub; + ASSERT_TRUE(stub.init(0)); + + // Set dep_pool_mark in the slot reclaim() will read + int32_t sm_last = PTO2_DEP_POOL_CLEANUP_INTERVAL; // e.g. 64 + int32_t target_slot = (sm_last - 1) & (stub.WINDOW - 1); // (63) & 63 = 63 + stub.slot_array[target_slot].dep_pool_mark = 50; + + sp.pool.reclaim(stub.ring_header, sm_last); + + // reclaim should advance pool tail so used count drops (from 100 to 51) + EXPECT_EQ(sp.pool.used(), 51) << "reclaim() reads EXACTLY THREE values from PTO2SharedMemoryRingHeader:\n" + " 1. slot_states (the pointer)\n" + " 2. task_window_mask\n" + " 3. slot_states[(sm_last-1) & mask].dep_pool_mark\n" + "All other fields of PTO2SharedMemoryRingHeader are unused."; + + stub.destroy(); +} + +// ensure_space() returns immediately when available() >= needed. +// PTO2SharedMemoryRingHeader is never accessed in the fast path. 
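+// Fast-path shape pinned by the next test -- a sketch inferred from its
+// assertions, not copied from pto_ring_buffer.cpp:
+//
+//   void PTO2DepListPool::ensure_space(PTO2SharedMemoryRingHeader &ring_hdr, int32_t needed) {
+//       if (available() >= needed) return;        // common case: ring_hdr never dereferenced
+//       // slow path only: reclamation walks ring_hdr.slot_states
+//   }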
+TEST(DepPoolStubIsolation, EnsureSpaceWithSufficientCapacity_NoSchedulerAccess) { + SmallPool sp; + sp.init(); + // Pool is empty: available() = capacity - 1 = 511 >> needed = 5 + + PTO2SharedMemoryRingHeader ring_hdr{}; + memset(&ring_hdr, 0, sizeof(ring_hdr)); // slot_states = nullptr (would crash if accessed) + + // Should return immediately without touching ring_hdr internals + sp.pool.ensure_space(ring_hdr, 5); + + EXPECT_GE( + sp.pool.available(), 5 + ) << "ensure_space() exits immediately when available() >= needed. " + "Zero-initialized ring header (slot_states=nullptr) is safe -- never dereferenced. " + "The signature requires PTO2SharedMemoryRingHeader& " + "but it is not accessed in the fast path."; +} + +// Document the sizeof cost: reclaim now takes PTO2SharedMemoryRingHeader which +// directly contains the three needed fields -- coupling is significantly reduced. +TEST(DepPoolStubIsolation, ReclaimRequiresExactlyThreeFields_NowOnRingHeader) { + // Fields actually needed by reclaim(): + // PTO2SharedMemoryRingHeader::slot_states (8 bytes, pointer) + // PTO2SharedMemoryRingHeader::task_window_mask (4 bytes, int32_t) + // PTO2TaskSlotState::dep_pool_mark (4 bytes, int32_t) + // Total minimum: 16 bytes of live data. + size_t needed_bytes = sizeof(PTO2TaskSlotState *) + sizeof(int32_t) + sizeof(int32_t); + + // Actual cost imposed by PTO2SharedMemoryRingHeader: + size_t actual_bytes = sizeof(PTO2SharedMemoryRingHeader); + + EXPECT_GT(actual_bytes, needed_bytes) << "reclaim() needs ~16 bytes of data but requires passing " + "PTO2SharedMemoryRingHeader (" + << actual_bytes + << " bytes). " + "Ratio: " + << (actual_bytes / needed_bytes) << "x over-coupling."; + + // Also report the exact sizes for documentation + SUCCEED() << "sizeof(PTO2SharedMemoryRingHeader) = " << actual_bytes << " bytes\n" + << "sizeof(PTO2TaskSlotState*) + 2*int32_t = " << needed_bytes << " bytes\n" + << "sizeof(PTO2TaskSlotState) = " << sizeof(PTO2TaskSlotState); +} + +// ============================================================================= +// Suite 2: SchedulerWithoutOrchestrator +// ============================================================================= + +// Scheduler can be fully initialized and destroyed without any orchestrator code. +// This test links pto_scheduler.cpp + pto_shared_memory.cpp only. +TEST(SchedulerWithoutOrchestrator, InitAndDestroy_NoOrchestratorNeeded) { + PTO2SharedMemoryHandle *sm = pto2_sm_create(SW, SH); + ASSERT_NE(sm, nullptr); + + uint8_t *heap = (uint8_t *)calloc(PTO2_MAX_RING_DEPTH, SH); + ASSERT_NE(heap, nullptr); + + PTO2SchedulerState sched{}; + bool ok = pto2_scheduler_init(&sched, sm->header); + EXPECT_TRUE(ok) << "pto2_scheduler_init succeeds without orchestrator.cpp in the link. " + "Scheduler is link-time isolated from Orchestrator."; + + EXPECT_EQ(sm->header->rings[0].task_window_size, (uint64_t)SW); + EXPECT_EQ(sm->header->rings[0].task_window_mask, SW - 1); + + pto2_scheduler_destroy(&sched); + free(heap); + pto2_sm_destroy(sm); +} + +// PTO2ReadyQueue is header-only (all methods are inline in pto_scheduler.h). +// It needs zero .cpp linkage -- only pto_runtime2_types.h for slot type. 
+TEST(SchedulerWithoutOrchestrator, ReadyQueue_StandaloneNoExternalDeps) { + PTO2ReadyQueue q; + pto2_ready_queue_init(&q, 64); + + alignas(64) PTO2TaskSlotState slot{}; + slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); + + EXPECT_TRUE(q.push(&slot)); + PTO2TaskSlotState *out = q.pop(); + EXPECT_EQ(out, &slot) << "PTO2ReadyQueue push/pop are entirely header-inline (zero link deps). " + "However, pto2_ready_queue_init / pto2_ready_queue_destroy are free " + "functions defined in pto_scheduler.cpp -- even a standalone ReadyQueue " + "requires linking pto_scheduler.cpp for lifecycle management. " + "Push/pop core logic is self-contained; init/destroy coupling is avoidable."; + + pto2_ready_queue_destroy(&q); +} + +// release_fanin_and_check_ready requires zero TensorMap or Orchestrator linkage. +// With fanin_count=1, one call makes new_refcount == fanin_count -> push to queue. +TEST(SchedulerWithoutOrchestrator, ReleaseFanin_PushesWhenFaninMet) { + PTO2SharedMemoryHandle *sm = pto2_sm_create(SW, SH); + ASSERT_NE(sm, nullptr); + uint8_t *heap = (uint8_t *)calloc(PTO2_MAX_RING_DEPTH, SH); + ASSERT_NE(heap, nullptr); + PTO2SchedulerState sched{}; + ASSERT_TRUE(pto2_scheduler_init(&sched, sm->header)); + + alignas(64) PTO2TaskSlotState slot{}; + slot.fanin_count = 1; + slot.fanin_refcount.store(0, std::memory_order_relaxed); + slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); + slot.active_mask = PTO2_SUBTASK_MASK_AIV0; + + bool became_ready = sched.release_fanin_and_check_ready(slot, nullptr); + EXPECT_TRUE(became_ready) << "fanin_count=1, one release -> task is ready"; + + // Verify the slot is now in the ready queue + PTO2ResourceShape shape = pto2_active_mask_to_shape(slot.active_mask); + PTO2TaskSlotState *popped = sched.ready_queues[static_cast(shape)].pop(); + EXPECT_EQ(popped, &slot) << "Slot found in ready queue -- no Orchestrator involvement"; + + pto2_scheduler_destroy(&sched); + free(heap); + pto2_sm_destroy(sm); +} + +// DESIGN CONTRACT: non-profiling release_fanin_and_check_ready pushes to the +// ready queue WITHOUT issuing an extra CAS(PENDING->READY) on task_state. +// The profiling overload (pto_scheduler.h:803-825) performs the CAS purely +// to be counted in atomic_count; correctness in either build comes from +// fanin_refcount.fetch_add -- only the decrementer that observes +// new_refcount == fanin_count pushes the slot, so the ready-queue invariant +// is preserved even while task_state remains PENDING. This test pins the +// non-profiling behavior so future edits can't silently add overhead. +TEST(SchedulerWithoutOrchestrator, NonProfiling_ReleaseFanin_DoesNotCAS_TaskState) { +#if PTO2_SCHED_PROFILING + GTEST_SKIP() << "Test only applies to non-profiling builds (PTO2_SCHED_PROFILING=0)"; +#endif + PTO2SharedMemoryHandle *sm = pto2_sm_create(SW, SH); + ASSERT_NE(sm, nullptr); + uint8_t *heap = (uint8_t *)calloc(PTO2_MAX_RING_DEPTH, SH); + ASSERT_NE(heap, nullptr); + PTO2SchedulerState sched{}; + ASSERT_TRUE(pto2_scheduler_init(&sched, sm->header)); + + alignas(64) PTO2TaskSlotState slot{}; + slot.fanin_count = 1; + slot.fanin_refcount.store(0, std::memory_order_relaxed); + slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); + slot.active_mask = PTO2_SUBTASK_MASK_AIV0; + + sched.release_fanin_and_check_ready(slot, nullptr); + + PTO2TaskState state_after = slot.task_state.load(std::memory_order_acquire); + + // Design contract: non-profiling path does not mutate task_state here. 
+ // Dispatch correctness relies on fanin_refcount's atomic fetch_add, not + // on the task_state value at push time. + EXPECT_EQ(state_after, PTO2_TASK_PENDING) << "Non-profiling release_fanin_and_check_ready must not CAS task_state; " + "the profiling overload's CAS exists only for atomic-op counting."; + + pto2_scheduler_destroy(&sched); + free(heap); + pto2_sm_destroy(sm); +} + +// on_mixed_task_complete transitions COMPLETED->CONSUMED with a minimal stub descriptor. +// No TensorMap or Orchestrator calls are made in this path. +TEST(SchedulerWithoutOrchestrator, OnMixedTaskComplete_StubDescriptor) { + PTO2SharedMemoryHandle *sm = pto2_sm_create(SW, SH); + ASSERT_NE(sm, nullptr); + uint8_t *heap = (uint8_t *)calloc(PTO2_MAX_RING_DEPTH, SH); + ASSERT_NE(heap, nullptr); + PTO2SchedulerState sched{}; + ASSERT_TRUE(pto2_scheduler_init(&sched, sm->header)); + + auto &rs = sched.ring_sched_states[0]; + PTO2TaskSlotState &slot = sm->header->rings[0].get_slot_state_by_slot(0); + + PTO2TaskDescriptor dummy_desc{}; + dummy_desc.packed_buffer_base = nullptr; + dummy_desc.packed_buffer_end = nullptr; + slot.task = &dummy_desc; + slot.ring_id = 0; + slot.fanout_count = 1; + slot.fanout_refcount.store(1, std::memory_order_relaxed); + slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed); + sm->header->rings[0].fc.current_task_index.store(1, std::memory_order_relaxed); + + sched.check_and_handle_consumed(slot); + + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED) + << "Scheduler's COMPLETED->CONSUMED path requires only a stub " + "PTO2TaskDescriptor (packed_buffer pointers can be nullptr). " + "No TensorMap or Orchestrator calls are made in this path."; + + pto2_scheduler_destroy(&sched); + free(heap); + pto2_sm_destroy(sm); +} + +// ============================================================================= +// Suite 3: TensorMapLinkDecoupling +// ============================================================================= + +// This entire file excludes pto_orchestrator.cpp from the link. +// If TensorMap init/insert/lookup work here, it proves link-time isolation. +TEST(TensorMapLinkDecoupling, BuildsAndRunsWithoutOrchestratorCpp) { + int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {64, 64, 64, 64}; + PTO2TensorMap tmap{}; + ASSERT_TRUE(tmap.init(256, 1024, window_sizes)); + + Tensor t = make_tensor(0x3000); + PTO2TaskId tid = PTO2TaskId::make(0, 0); + tmap.insert(t, tid); + + PTO2LookupResult result; + tmap.lookup(t, result); + EXPECT_GE(result.count, 1) << "TensorMap insert+lookup work without pto_orchestrator.cpp in the link.\n" + "Root cause: pto_tensormap.cpp includes pto_orchestrator.h (line 22) but\n" + "calls ZERO orchestrator functions -- confirmed by objdump UND analysis.\n" + "The include only provides the PTO2OrchestratorState type definition,\n" + "which is stored as PTO2OrchestratorState* (pointer -- forward decl suffices)."; + + tmap.destroy(); +} + +// Explicitly set orch = nullptr, then run insert and lookup. +// If orch were dereferenced in the hot path, this would crash. 
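+// Only a raw pointer is stored, and the hot path never dereferences it -- which
+// is also why the full include is avoidable. Hypothetical one-line change, shown
+// only to make the suite's claim concrete (none of these tests modify any source):
+//
+//   // in the TensorMap sources today:
+//   #include "pto_orchestrator.h"      // drags in the whole runtime header tree
+//   // sufficient instead, because PTO2TensorMap only stores PTO2OrchestratorState*:
+//   struct PTO2OrchestratorState;      // forward declaration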
+TEST(TensorMapLinkDecoupling, OrchPointer_NeverDereferencedInHotPath) { + int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {64, 64, 64, 64}; + PTO2TensorMap tmap{}; + ASSERT_TRUE(tmap.init(256, 1024, window_sizes)); + tmap.orch = nullptr; // explicitly clear + + Tensor t1 = make_tensor(0x4000, 1, 200); + Tensor t2 = make_tensor(0x5000, 1, 100); + PTO2TaskId t1id = PTO2TaskId::make(0, 0); + PTO2TaskId t2id = PTO2TaskId::make(0, 1); + tmap.insert(t1, t1id); + tmap.insert(t2, t2id); + + PTO2LookupResult r; + tmap.lookup(t1, r); + EXPECT_GE(r.count, 1) << "orch=nullptr does not crash insert or lookup. " + "The orch pointer is only used by sync_tensormap (called from orchestrator). " + "In normal usage: orch is set by pto2_orchestrator_init, " + "but insert/lookup never touch it."; + + tmap.destroy(); +} + +// sync_tensormap only advances the cleanup clock -- it doesn't access orch. +// Calling it with orch=nullptr is safe. +TEST(TensorMapLinkDecoupling, SyncTensormap_DoesNotAccessOrch) { + int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {64, 64, 64, 64}; + PTO2TensorMap tmap{}; + ASSERT_TRUE(tmap.init(256, 1024, window_sizes)); + tmap.orch = nullptr; + + // Insert entries for tasks 0..63 in ring 0 + for (int i = 0; i < 64; i++) { + Tensor t = make_tensor(0x6000 + i * 64); + tmap.insert(t, PTO2TaskId::make(0, i)); + } + + // Advance validity: tasks 0..31 are now retired + tmap.sync_validity(0, 32); + + // sync_tensormap only calls sync_validity internally -- no orch access + tmap.sync_tensormap(PTO2TaskId::make(0, 0), 32); + + // Valid count should reflect only tasks 32..63 + int valid = tmap.valid_count(); + EXPECT_LE(valid, 64) << "sync_tensormap(ring_id, last_alive) is purely time-advance logic. " + "No dereference of orch pointer. " + "Cleanup path is independent of OrchestratorState."; + + tmap.destroy(); +} + +// Document the transitive include chain caused by one unnecessary #include. +TEST(TensorMapLinkDecoupling, IncludeCost_OnePointerField_FullRuntimeHeaders) { + // pto_tensormap.cpp includes pto_orchestrator.h for PTO2OrchestratorState* orch. + // A forward declaration "struct PTO2OrchestratorState;" would be sufficient + // because orch is a raw pointer and is never dereferenced in tensormap.cpp. + // + // Cost of the full include: + // pto_orchestrator.h includes: + // -> pto_scheduler.h -> pto_ring_buffer.h -> pto_shared_memory.h + // -> pto_runtime2_types.h -> pto_types.h, pto_submit_types.h, pto2_dispatch_payload.h + // + // Every TensorMap compilation unit pulls in the entire runtime header tree + // for a single pointer field. + + // Verify: PTO2TensorMap::orch is a raw pointer (not embedded object) + EXPECT_EQ(sizeof(PTO2OrchestratorState *), sizeof(void *)) + << "PTO2OrchestratorState* is a pointer -- sizeof(void*) bytes. " + "A forward declaration suffices. " + "The full include of pto_orchestrator.h transitively pulls in " + "pto_scheduler.h + pto_ring_buffer.h + pto_shared_memory.h + " + "pto_runtime2_types.h (7+ headers) for a single 8-byte pointer field."; + + // Also: this test file compiles and links without pto_orchestrator.cpp -- + // further confirming the include is header-only compile-time coupling. + SUCCEED() << "This test file does not link pto_orchestrator.cpp. 
" + "Build success = confirmed link-time isolation."; +} + +// ============================================================================= +// Suite 4: CompileTimeIncludeCoupling +// ============================================================================= + +// pto_ring_buffer.cpp's DepPool::reclaim takes PTO2SharedMemoryRingHeader& directly. +// ring_buffer.o has ZERO UND symbols from scheduler -- type-level coupling is resolved. +// The coupling is now to PTO2SharedMemoryRingHeader: accessing struct fields inline. +TEST(CompileTimeIncludeCoupling, RingBufferCoupledToSharedMemoryAtTypeLevel) { + // Demonstrate: DepPool::reclaim is in pto_ring_buffer.cpp (not scheduler) + // and it accesses PTO2SharedMemoryRingHeader internal fields inline. + // This means: changing PTO2SharedMemoryRingHeader layout silently breaks ring_buffer + // without any API change or linker error. + + // Cross-check: the field offset in the stub must match the real struct. + MinimalSchedStub stub; + ASSERT_TRUE(stub.init(0)); + + // Write to dep_pool_mark via stub's slot_array + stub.slot_array[63].dep_pool_mark = 99; + + // Read the same field through PTO2SharedMemoryRingHeader's accessor + int32_t mark = stub.ring_header.get_slot_state_by_task_id(63).dep_pool_mark; + EXPECT_EQ(mark, 99) << "ring_buffer.cpp accesses PTO2SharedMemoryRingHeader::slot_states " + "inline (no virtual dispatch, no function call). " + "Changing the layout of PTO2TaskSlotState or PTO2SharedMemoryRingHeader breaks " + "pto_ring_buffer.cpp without touching any function signature or .h file API. " + "This is a hidden structural coupling: invisible to the linker."; + + stub.destroy(); +} + +// Both Scheduler and TensorMap independently compute the same slot index formula. +// Duplication means if one changes, the other silently diverges. +TEST(CompileTimeIncludeCoupling, TaskWindowMask_DuplicatedInTwoComponents) { + // Scheduler formula (pto_scheduler.h:301): + // slot_states[local_id & task_window_mask] + // TensorMap formula (pto_tensormap.h:~364): + // local_id & (task_window_sizes[ring_id] - 1) + // Both assume power-of-2 window_size; neither validates it. + + int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {64, 64, 64, 64}; + PTO2TensorMap tmap{}; + ASSERT_TRUE(tmap.init(256, 1024, window_sizes)); + + PTO2SharedMemoryHandle *sm = pto2_sm_create(64, SH); + ASSERT_NE(sm, nullptr); + uint8_t *heap = (uint8_t *)calloc(PTO2_MAX_RING_DEPTH, SH); + ASSERT_NE(heap, nullptr); + PTO2SchedulerState sched{}; + ASSERT_TRUE(pto2_scheduler_init(&sched, sm->header)); + + // Verify both agree for local_id = 37, ring = 0 + int32_t local_id = 37; + int32_t sched_slot = local_id & sm->header->rings[0].task_window_mask; + int32_t tmap_slot = local_id & (tmap.task_window_sizes[0] - 1); + + EXPECT_EQ(sched_slot, tmap_slot) << "Scheduler slot = local_id & mask = " << sched_slot + << "\n" + "TensorMap slot = local_id & (size-1) = " + << tmap_slot + << "\n" + "Currently agree -- but the formula is written twice, in two components, " + "with no shared utility. A change to one (e.g., non-power-of-2 support) " + "would not automatically update the other."; + + pto2_scheduler_destroy(&sched); + free(heap); + pto2_sm_destroy(sm); + tmap.destroy(); +} + +// PTO2_MAX_RING_DEPTH propagates into fixed-size arrays in 4 components. +// Changing it requires recompiling all 4 components simultaneously. +TEST(CompileTimeIncludeCoupling, MaxRingDepthInFourComponents) { + // 1. Orchestrator: rings[PTO2_MAX_RING_DEPTH] (visible via TMRSystem) + // 2. 
Scheduler: ring_sched_states[PTO2_MAX_RING_DEPTH] + static_assert( + sizeof(PTO2SchedulerState::ring_sched_states) / sizeof(PTO2SchedulerState::RingSchedState) == + PTO2_MAX_RING_DEPTH, + "Scheduler array size must equal PTO2_MAX_RING_DEPTH" + ); + + // 3. SharedMemory: header->rings[PTO2_MAX_RING_DEPTH] + static_assert( + sizeof(PTO2SharedMemoryHeader::rings) / sizeof(PTO2SharedMemoryRingHeader) == PTO2_MAX_RING_DEPTH, + "SharedMemory array size must equal PTO2_MAX_RING_DEPTH" + ); + + // 4. TensorMap: task_entry_heads[], task_window_sizes[], last_task_alives[] + PTO2TensorMap dummy{}; + EXPECT_EQ(sizeof(dummy.task_entry_heads) / sizeof(dummy.task_entry_heads[0]), (size_t)PTO2_MAX_RING_DEPTH); + EXPECT_EQ(sizeof(dummy.task_window_sizes) / sizeof(dummy.task_window_sizes[0]), (size_t)PTO2_MAX_RING_DEPTH); + EXPECT_EQ(sizeof(dummy.last_task_alives) / sizeof(dummy.last_task_alives[0]), (size_t)PTO2_MAX_RING_DEPTH); + + SUCCEED() << "PTO2_MAX_RING_DEPTH=" << PTO2_MAX_RING_DEPTH + << " is baked into fixed arrays in Scheduler, SharedMemory, and TensorMap. " + "Changing this constant requires recompiling ALL 4 components. " + "No runtime configurability exists."; +} + +// Including pto_scheduler.h transitively pulls in the entire runtime type hierarchy. +// Document the breadth of this coupling for a single component include. +TEST(CompileTimeIncludeCoupling, SchedulerHeaderTransitiveIncludes) { + // #include "pto_scheduler.h" causes: + // pto_scheduler.h -> pto_runtime2_types.h (task state, config constants) + // -> pto_shared_memory.h (SM handle, ring headers, flow control) + // -> pto_runtime2_types.h (again, guarded) + // -> pto_ring_buffer.h (TaskAllocator, FaninPool, DepPool, RingSet) + // -> pto_shared_memory.h (again, guarded) + // -> common/core_type.h (CoreType enum) + // Total headers transitively included: 6+ + + // Verify a few types from the transitive chain are available in this TU + // (these would be missing if the includes were broken) + PTO2TaskAllocator ta{}; // from pto_ring_buffer.h (consolidated TaskRing + HeapRing) + PTO2SharedMemoryHeader smh{}; // from pto_shared_memory.h + PTO2TaskState ts = PTO2_TASK_PENDING; // from pto_runtime2_types.h + (void)ta; + (void)smh; + (void)ts; + + SUCCEED() << "A single #include \"pto_scheduler.h\" makes available: " + "PTO2TaskAllocator, PTO2FaninPool, PTO2DepListPool, " + "PTO2SharedMemoryHandle, PTO2TaskSlotState, PTO2TaskState, " + "PTO2ReadyQueue, CoreType -- the entire runtime type set. " + "This creates a broad compile-time coupling surface."; +} + +// ============================================================================= +// Suite 5: ProfilingBehaviorCoupling +// ============================================================================= + +// The non-profiling release_fanin_and_check_ready (lines 426-448) does NOT +// perform CAS(PENDING->READY) before pushing to the ready queue. +// The profiling overload (lines 450-476) DOES perform the CAS. +// Document this divergence as a structural coupling of profiling to correctness. 
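+// Observable difference the next test pins down (mirrors its own assertions):
+//   build                        task_state after release        pushed to ready queue
+//   PTO2_SCHED_PROFILING == 0    PTO2_TASK_PENDING (no CAS)      yes, unconditionally
+//   PTO2_SCHED_PROFILING == 1    PTO2_TASK_READY (CAS winner)    yes, CAS winner only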
+TEST(ProfilingBehaviorCoupling, ProfilingAndNonProfiling_DifferentStateAfterRelease) { + PTO2SharedMemoryHandle *sm = pto2_sm_create(SW, SH); + ASSERT_NE(sm, nullptr); + uint8_t *heap = (uint8_t *)calloc(PTO2_MAX_RING_DEPTH, SH); + ASSERT_NE(heap, nullptr); + PTO2SchedulerState sched{}; + ASSERT_TRUE(pto2_scheduler_init(&sched, sm->header)); + + alignas(64) PTO2TaskSlotState slot{}; + slot.fanin_count = 1; + slot.fanin_refcount.store(0, std::memory_order_relaxed); + slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); + slot.active_mask = PTO2_SUBTASK_MASK_AIV0; + + sched.release_fanin_and_check_ready(slot, nullptr); + + PTO2TaskState state = slot.task_state.load(std::memory_order_acquire); + +#if PTO2_SCHED_PROFILING + // Profiling path: CAS was performed -> READY + EXPECT_EQ(state, PTO2_TASK_READY) << "Profiling build: CAS(PENDING->READY) executed before push. " + "Worker will see READY state when it pops this slot."; +#else + // Non-profiling path: no CAS -> still PENDING + EXPECT_EQ(state, PTO2_TASK_PENDING) << "Non-profiling build: slot pushed to ready queue with task_state=PENDING.\n" + "PTO2_SCHED_PROFILING flag changes CORRECTNESS, not just measurement.\n" + "See pto_scheduler.h lines 426-448 (non-profiling) vs 450-476 (profiling)."; +#endif + + pto2_scheduler_destroy(&sched); + free(heap); + pto2_sm_destroy(sm); +} + +// The profiling overload has an additional CAS guard that prevents double-push. +// The non-profiling overload relies on the caller ensuring exactly-once delivery. +// Document the API asymmetry as a coupling risk. +TEST(ProfilingBehaviorCoupling, ProfilingOverload_HasCASGuard_NonProfilingDoesNot) { + // Non-profiling signature (lines 426-448): + // bool release_fanin_and_check_ready(slot, local_bufs = nullptr) + // -> pushes unconditionally when fanin met; no CAS guard + // + // Profiling signature (lines 450-476): + // bool release_fanin_and_check_ready(slot, atomic_count, push_wait, local_bufs) + // -> CAS(PENDING->READY); only pushes if CAS succeeds + // -> if two threads race and both see new_refcount==fanin_count, + // only ONE will win the CAS; the other returns false (no double-push) + // + // Non-profiling has no such guard: if two threads both see new_refcount==fanin_count + // (which shouldn't happen due to fetch_add atomicity, but still an asymmetry), + // both would push. + + // Verify the non-profiling path returns true whenever fanin_count is met + PTO2SharedMemoryHandle *sm = pto2_sm_create(SW, SH); + ASSERT_NE(sm, nullptr); + uint8_t *heap = (uint8_t *)calloc(PTO2_MAX_RING_DEPTH, SH); + ASSERT_NE(heap, nullptr); + PTO2SchedulerState sched{}; + ASSERT_TRUE(pto2_scheduler_init(&sched, sm->header)); + + alignas(64) PTO2TaskSlotState slot{}; + slot.fanin_count = 2; + slot.fanin_refcount.store(0, std::memory_order_relaxed); + slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); + slot.active_mask = PTO2_SUBTASK_MASK_AIV0; + + bool r1 = sched.release_fanin_and_check_ready(slot, nullptr); // refcount->1, !=2 + bool r2 = sched.release_fanin_and_check_ready(slot, nullptr); // refcount->2, ==2 + + EXPECT_FALSE(r1) << "First release: refcount=1 != fanin_count=2 -> not ready"; + EXPECT_TRUE(r2) << "Second release: refcount=2 == fanin_count=2 -> ready, pushed"; + + SUCCEED() << "Non-profiling path: return true means 'pushed to queue'. " + "Profiling path: return true means 'CAS succeeded AND pushed'. 
" + "The distinction matters for exactly-once delivery guarantees " + "under concurrent access -- the non-profiling version trusts " + "fetch_add atomicity alone to prevent double-push."; + + pto2_scheduler_destroy(&sched); + free(heap); + pto2_sm_destroy(sm); +} + +// Profiling externs are declared inside #if blocks in hot-path headers. +// In non-profiling builds they are absent, but the conditional preprocessor blocks +// are part of the header's cognitive surface -- coupling profiling concern to the header. +TEST(ProfilingBehaviorCoupling, ProfilingExterns_InHotPathHeaders) { + // pto_scheduler.h declares (inside #if PTO2_SCHED_PROFILING): + // extern uint64_t g_sched_lock_cycle[]; + // extern uint64_t g_sched_fanout_cycle[]; + // ... (8+ extern arrays, used in on_mixed_task_complete) + // + // pto_ring_buffer.h declares (inside #if PTO2_ORCH_PROFILING): + // extern uint64_t g_orch_heap_wait_cycle; + // extern uint64_t g_orch_heap_atomic_count; + // ... (4+ extern scalars, used in heap_ring_try_alloc) + // + // These externs sit inside headers that are included in hot-path code. + // The profiling concern bleeds into the compile model of all translation units + // that include these headers. + +#if PTO2_SCHED_PROFILING + // In profiling build: the externs must be defined somewhere -- test stubs must provide them + SUCCEED() << "PTO2_SCHED_PROFILING=1: profiling externs are live in this build. " + "They are declared in pto_scheduler.h and used in on_mixed_task_complete."; +#else + // In non-profiling build: externs are absent -- but the #if blocks remain in the header + SUCCEED() << "PTO2_SCHED_PROFILING=0: profiling extern declarations are compiled out. " + "However, the #if PTO2_SCHED_PROFILING blocks in pto_scheduler.h " + "and pto_ring_buffer.h add conditional complexity to every reader " + "of these hot-path headers. Profiling coupling cannot be extracted " + "without modifying the headers themselves."; +#endif + + // Regardless of flag: the behavioral difference in release_fanin_and_check_ready + // means profiling and non-profiling builds have different task state semantics. + // This is the most significant coupling: a measurement flag alters correctness. + size_t slot_size = sizeof(PTO2TaskSlotState); + EXPECT_EQ(slot_size, 64u) << "PTO2TaskSlotState is 64 bytes (1 cache line). " + "Profiling adds atomic counters to PTO2SchedulerState (tasks_completed, " + "tasks_consumed) when PTO2_SCHED_PROFILING=1, potentially inflating the struct."; +} diff --git a/tests/ut/cpp/pto2_a2a3/test_orchestrator_fatal.cpp b/tests/ut/cpp/pto2_a2a3/test_orchestrator_fatal.cpp new file mode 100644 index 000000000..7eb012216 --- /dev/null +++ b/tests/ut/cpp/pto2_a2a3/test_orchestrator_fatal.cpp @@ -0,0 +1,187 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * UT for the orchestrator-side fatal reporting path. + * + * Targets pto2_orch_report_fatal (pto_orchestrator.cpp) and verifies: + * - orch->fatal latches to true on any non-zero error code + * - the first non-zero code wins via CAS into sm_header->orch_error_code + * - subsequent fatal reports do NOT overwrite the first code + * - PTO2_ERROR_NONE never latches the shared-memory code (but still flips + * the local fatal flag -- by design, callers may use it to mark fatal + * without writing a code) + * - resilience when sm_handle / header is null (no crash, local flag flips) + * + * This test exercises the real symbol against a fully-initialized + * orchestrator + shared memory pair, complementing the fake-runtime test + * (test_a2a3_pto2_fatal.cpp) that only validates the ops-table dispatch. + */ + +#include + +#include +#include + +#include "pto_orchestrator.h" +#include "pto_runtime_status.h" +#include "pto_scheduler.h" +#include "pto_shared_memory.h" + +namespace { + +constexpr uint64_t kHeapSize = 64 * 1024; +constexpr int32_t kWindowSize = 64; +constexpr int32_t kDepPool = 256; + +class OrchestratorFatalTest : public ::testing::Test { +protected: + PTO2SharedMemoryHandle *sm_ = nullptr; + PTO2SchedulerState sched_{}; + PTO2OrchestratorState orch_{}; + uint8_t *gm_heap_ = nullptr; + bool sched_ok_ = false; + bool orch_ok_ = false; + + void SetUp() override { + sm_ = pto2_sm_create(kWindowSize, kHeapSize); + ASSERT_NE(sm_, nullptr); + + gm_heap_ = static_cast(std::calloc(PTO2_MAX_RING_DEPTH, kHeapSize)); + ASSERT_NE(gm_heap_, nullptr); + + sched_ok_ = pto2_scheduler_init(&sched_, sm_->header, kDepPool); + ASSERT_TRUE(sched_ok_); + + orch_ok_ = pto2_orchestrator_init(&orch_, sm_->header, gm_heap_, kHeapSize, kDepPool); + ASSERT_TRUE(orch_ok_); + } + + void TearDown() override { + if (orch_ok_) pto2_orchestrator_destroy(&orch_); + if (sched_ok_) pto2_scheduler_destroy(&sched_); + if (gm_heap_) std::free(gm_heap_); + if (sm_) pto2_sm_destroy(sm_); + } + + int32_t shared_orch_code() const { return sm_->header->orch_error_code.load(std::memory_order_acquire); } +}; + +} // namespace + +// ---------- baseline ---------- + +TEST_F(OrchestratorFatalTest, InitialState_NoFatalNoSharedCode) { + // Verify no fatal state via the observable shared memory output + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_NONE); +} + +// ---------- happy path: single fatal latches both local flag and shared code ---------- + +TEST_F(OrchestratorFatalTest, ReportFatal_SetsLocalFlagAndSharedCode) { + pto2_orch_report_fatal(&orch_, PTO2_ERROR_HEAP_RING_DEADLOCK, "test", "deadlock at ring %d", 3); + + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_HEAP_RING_DEADLOCK); +} + +// ---------- CAS first-writer-wins ---------- + +TEST_F(OrchestratorFatalTest, SecondReportFatal_DoesNotOverwriteSharedCode) { + pto2_orch_report_fatal(&orch_, PTO2_ERROR_HEAP_RING_DEADLOCK, "test", nullptr); + pto2_orch_report_fatal(&orch_, PTO2_ERROR_DEP_POOL_OVERFLOW, "test", nullptr); + + // Second report must NOT overwrite the first latched code. 
+ EXPECT_EQ(shared_orch_code(), PTO2_ERROR_HEAP_RING_DEADLOCK); +} + +TEST_F(OrchestratorFatalTest, RepeatedSameCode_StaysLatched) { + pto2_orch_report_fatal(&orch_, PTO2_ERROR_INVALID_ARGS, "test", nullptr); + pto2_orch_report_fatal(&orch_, PTO2_ERROR_INVALID_ARGS, "test", nullptr); + + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_INVALID_ARGS); +} + +// ---------- PTO2_ERROR_NONE: marks fatal locally, does NOT touch shared code ---------- + +TEST_F(OrchestratorFatalTest, ReportFatalWithErrorNone_DoesNotWriteSharedCode) { + pto2_orch_report_fatal(&orch_, PTO2_ERROR_NONE, "test", nullptr); + + // Local fatal flag flips (tested via another report not latching a different code), + // but no code is written to shared memory. + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_NONE); +} + +// ---------- PTO2_ERROR_NONE first does not block a real code from latching ---------- + +TEST_F(OrchestratorFatalTest, ErrorNoneFirst_RealCodeStillLatchesAfter) { + pto2_orch_report_fatal(&orch_, PTO2_ERROR_NONE, "test", nullptr); + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_NONE); + + pto2_orch_report_fatal(&orch_, PTO2_ERROR_SCOPE_DEADLOCK, "test", nullptr); + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_SCOPE_DEADLOCK); +} + +// ---------- coverage of every defined orchestrator code ---------- + +TEST_F(OrchestratorFatalTest, EveryOrchCode_LatchesIntoSharedMemory) { + const int32_t codes[] = { + PTO2_ERROR_SCOPE_DEADLOCK, + PTO2_ERROR_HEAP_RING_DEADLOCK, + PTO2_ERROR_FLOW_CONTROL_DEADLOCK, + PTO2_ERROR_DEP_POOL_OVERFLOW, + PTO2_ERROR_INVALID_ARGS, + PTO2_ERROR_DEPENDENCY_OVERFLOW, + PTO2_ERROR_REQUIRE_SYNC_START_INVALID, + PTO2_ERROR_TENSOR_WAIT_TIMEOUT, + PTO2_ERROR_EXPLICIT_ORCH_FATAL, + }; + for (int32_t code : codes) { + // Reset latches between iterations. Direct field access is unavoidable here + // since there is no public reset API for the orchestrator fatal state. 
+ sm_->header->orch_error_code.store(PTO2_ERROR_NONE, std::memory_order_release); + orch_.fatal = false; + + pto2_orch_report_fatal(&orch_, code, "test", "code=%d", code); + + SCOPED_TRACE(testing::Message() << "code=" << code); + EXPECT_EQ(shared_orch_code(), code); + } +} + +// ---------- format-string variants must not crash ---------- + +TEST_F(OrchestratorFatalTest, NullFmt_DoesNotCrash) { + pto2_orch_report_fatal(&orch_, PTO2_ERROR_INVALID_ARGS, "func", nullptr); + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_INVALID_ARGS); +} + +TEST_F(OrchestratorFatalTest, EmptyFmt_DoesNotCrash) { + pto2_orch_report_fatal(&orch_, PTO2_ERROR_INVALID_ARGS, "func", ""); + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_INVALID_ARGS); +} + +TEST_F(OrchestratorFatalTest, FmtWithVarArgs_DoesNotCrash) { + pto2_orch_report_fatal( + &orch_, PTO2_ERROR_TENSOR_WAIT_TIMEOUT, "func", "tensor=%p slot=%d msg=%s", + reinterpret_cast(0xdeadbeef), 17, "boom" + ); + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_TENSOR_WAIT_TIMEOUT); +} + +// ---------- end-to-end: status helper sees latched code ---------- + +TEST_F(OrchestratorFatalTest, StatusHelperReadsLatchedOrchCode) { + pto2_orch_report_fatal(&orch_, PTO2_ERROR_FLOW_CONTROL_DEADLOCK, "func", nullptr); + + int32_t orch_code = shared_orch_code(); + int32_t sched_code = sm_->header->sched_error_code.load(std::memory_order_acquire); + EXPECT_EQ(pto2_runtime_status_from_error_codes(orch_code, sched_code), -PTO2_ERROR_FLOW_CONTROL_DEADLOCK); +} diff --git a/tests/ut/cpp/pto2_a2a3/test_orchestrator_submit.cpp b/tests/ut/cpp/pto2_a2a3/test_orchestrator_submit.cpp new file mode 100644 index 000000000..a5afb3b3b --- /dev/null +++ b/tests/ut/cpp/pto2_a2a3/test_orchestrator_submit.cpp @@ -0,0 +1,273 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * Orchestrator submit-path UT. + * + * Covers pto2_submit_mixed_task, pto2_alloc_tensors, pto2_orchestrator_done, + * and pto2_orchestrator_set_scheduler on a fully initialized + * (TMR) system. + * + * Follows AAA and FIRST: each TEST_F builds a fresh TMRSystem, exercises + * one behavior, and tears the system down in TearDown(). + */ + +#include + +#include +#include +#include + +#include "pto_orchestration_api.h" // make_tensor_external, TensorCreateInfo ctor +#include "pto_orchestrator.h" +#include "pto_ring_buffer.h" +#include "pto_scheduler.h" +#include "pto_shared_memory.h" +#include "pto_submit_types.h" +#include "pto_tensormap.h" +#include "tensor.h" + +namespace { + +constexpr uint64_t kHeapSize = 64 * 1024; +constexpr int32_t kWindowSize = 64; +constexpr int32_t kDepPool = 256; + +// ----------------------------------------------------------------------------- +// Fixture: minimal TMR system for orchestrator-level tests. 
+// ----------------------------------------------------------------------------- +class OrchestratorSubmitTest : public ::testing::Test { +protected: + PTO2SharedMemoryHandle *sm_ = nullptr; + PTO2SchedulerState sched_{}; + PTO2OrchestratorState orch_{}; + uint8_t *gm_heap_ = nullptr; + bool sched_ok_ = false; + bool orch_ok_ = false; + + void SetUp() override { + sm_ = pto2_sm_create(kWindowSize, kHeapSize); + ASSERT_NE(sm_, nullptr); + + gm_heap_ = static_cast(std::calloc(PTO2_MAX_RING_DEPTH, kHeapSize)); + ASSERT_NE(gm_heap_, nullptr); + + sched_ok_ = pto2_scheduler_init(&sched_, sm_->header, kDepPool); + ASSERT_TRUE(sched_ok_); + + orch_ok_ = pto2_orchestrator_init(&orch_, sm_->header, gm_heap_, kHeapSize, kDepPool); + ASSERT_TRUE(orch_ok_); + + pto2_orchestrator_set_scheduler(&orch_, &sched_); + } + + void TearDown() override { + if (orch_ok_) pto2_orchestrator_destroy(&orch_); + if (sched_ok_) pto2_scheduler_destroy(&sched_); + if (gm_heap_) std::free(gm_heap_); + if (sm_) pto2_sm_destroy(sm_); + } + + // Helper: build a minimal TensorCreateInfo owning one FP32 scalar output. + static TensorCreateInfo make_scalar_ci() { + static const uint32_t kShape[1] = {1}; + return TensorCreateInfo(kShape, 1, DataType::FLOAT32); + } + + bool has_orch_error() const { + return sm_->header->orch_error_code.load(std::memory_order_acquire) != PTO2_ERROR_NONE; + } +}; + +} // namespace + +// ---------- set_scheduler ---------- + +TEST_F(OrchestratorSubmitTest, SetScheduler_StoresPointer) { + PTO2SchedulerState other{}; + pto2_orchestrator_set_scheduler(&orch_, &other); + // Direct field read: no public getter exists for the scheduler pointer. + EXPECT_EQ(orch_.scheduler, &other); + + // Restore for TearDown. + pto2_orchestrator_set_scheduler(&orch_, &sched_); +} + +// ---------- alloc_tensors: argument validation ---------- + +TEST_F(OrchestratorSubmitTest, AllocTensors_EmptyArgs_MarksFatal) { + Arg args; // no tensors, no scalars + + TaskOutputTensors result = pto2_alloc_tensors(&orch_, args); + + EXPECT_TRUE(result.empty()); + EXPECT_TRUE(has_orch_error()); +} + +TEST_F(OrchestratorSubmitTest, AllocTensors_WithScalars_MarksFatal) { + TensorCreateInfo ci = make_scalar_ci(); + Arg args; + args.add_output(ci); + args.add_scalar(uint64_t{42}); + + TaskOutputTensors result = pto2_alloc_tensors(&orch_, args); + + EXPECT_TRUE(result.empty()); + EXPECT_TRUE(has_orch_error()); +} + +TEST_F(OrchestratorSubmitTest, AllocTensors_InputArg_MarksFatal) { + // alloc_tensors only accepts OUTPUT TensorCreateInfo args. + uint32_t shape[1] = {1}; + Tensor input = make_tensor_external(reinterpret_cast(0x1000), shape, 1); + Arg args; + args.add_input(input); + + TaskOutputTensors result = pto2_alloc_tensors(&orch_, args); + + EXPECT_TRUE(result.empty()); + EXPECT_TRUE(has_orch_error()); +} + +TEST_F(OrchestratorSubmitTest, AllocTensors_OutputOnly_ReturnsMaterializedTensors) { + // Arrange: two output CIs, inside an active scope. + TensorCreateInfo ci1 = make_scalar_ci(); + TensorCreateInfo ci2 = make_scalar_ci(); + Arg args; + args.add_output(ci1, ci2); + + // Act + pto2_scope_begin(&orch_); + TaskOutputTensors result = pto2_alloc_tensors(&orch_, args); + pto2_scope_end(&orch_); + + // Assert + EXPECT_FALSE(has_orch_error()); + EXPECT_EQ(result.size(), 2U); +} + +TEST_F(OrchestratorSubmitTest, AllocTensors_AfterFatal_ReturnsEmpty) { + // Arrange: force fatal. 
+    pto2_orch_report_fatal(&orch_, PTO2_ERROR_EXPLICIT_ORCH_FATAL, "UT", nullptr);
+    ASSERT_TRUE(has_orch_error());
+
+    TensorCreateInfo ci = make_scalar_ci();
+    Arg args;
+    args.add_output(ci);
+
+    // Act
+    TaskOutputTensors result = pto2_alloc_tensors(&orch_, args);
+
+    // Assert
+    EXPECT_TRUE(result.empty());
+}
+
+// ---------- submit_mixed_task ----------
+
+TEST_F(OrchestratorSubmitTest, SubmitMixedTask_AfterFatal_ReturnsEmpty) {
+    // Arrange: pre-fatal state
+    pto2_orch_report_fatal(&orch_, PTO2_ERROR_EXPLICIT_ORCH_FATAL, "UT", nullptr);
+
+    MixedKernels mixed;
+    mixed.aic_kernel_id = 0;
+    Arg args;
+
+    // Act
+    TaskOutputTensors result = pto2_submit_mixed_task(&orch_, mixed, args);
+
+    // Assert
+    EXPECT_TRUE(result.empty());
+}
+
+TEST_F(OrchestratorSubmitTest, SubmitMixedTask_ArgWithError_MarksFatalInvalidArgs) {
+    // Arrange: craft an Arg with has_error set.
+    // Calling add_input after add_scalar triggers the ordering error path.
+    uint32_t shape[1] = {1};
+    Tensor t = make_tensor_external(reinterpret_cast<void *>(0x1000), shape, 1);
+    Arg args;
+    args.add_scalar(uint64_t{1});
+    args.add_input(t); // illegal ordering -> has_error = true
+    ASSERT_TRUE(args.has_error);
+
+    MixedKernels mixed;
+    mixed.aic_kernel_id = 0;
+
+    // Act
+    pto2_scope_begin(&orch_);
+    TaskOutputTensors result = pto2_submit_mixed_task(&orch_, mixed, args);
+    pto2_scope_end(&orch_);
+
+    // Assert
+    EXPECT_TRUE(result.empty());
+    EXPECT_TRUE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, SubmitMixedTask_PureInputOnly_Succeeds) {
+    // Arrange: one input tensor, one AIC kernel, within a scope.
+    uint32_t shape[1] = {1};
+    Tensor input = make_tensor_external(reinterpret_cast<void *>(0x2000), shape, 1);
+
+    Arg args;
+    args.add_input(input);
+    ASSERT_FALSE(args.has_error);
+
+    MixedKernels mixed;
+    mixed.aic_kernel_id = 7; // any non-invalid id
+
+    // Act
+    pto2_scope_begin(&orch_);
+    TaskOutputTensors result = pto2_submit_mixed_task(&orch_, mixed, args);
+    pto2_scope_end(&orch_);
+
+    // Assert: submit returns with no outputs, and no fatal state was set.
+    EXPECT_TRUE(result.empty());
+    EXPECT_FALSE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, SubmitMixedTask_OutputTensor_MaterializesResult) {
+    // Arrange: one OUTPUT TensorCreateInfo -> task produces one tensor.
+    TensorCreateInfo ci = make_scalar_ci();
+    Arg args;
+    args.add_output(ci);
+
+    MixedKernels mixed;
+    mixed.aic_kernel_id = 1;
+
+    // Act
+    pto2_scope_begin(&orch_);
+    TaskOutputTensors result = pto2_submit_mixed_task(&orch_, mixed, args);
+    pto2_scope_end(&orch_);
+
+    // Assert
+    EXPECT_FALSE(has_orch_error());
+    EXPECT_EQ(result.size(), 1U);
+}
+
+// ---------- orchestrator_done ----------
+
+TEST_F(OrchestratorSubmitTest, OrchestratorDone_SetsSharedMemoryFlag) {
+    // Arrange
+    ASSERT_EQ(sm_->header->orchestrator_done.load(), 0);
+
+    // Act
+    pto2_orchestrator_done(&orch_);
+
+    // Assert
+    EXPECT_EQ(sm_->header->orchestrator_done.load(std::memory_order_acquire), 1);
+}
+
+TEST_F(OrchestratorSubmitTest, OrchestratorDone_IsIdempotent) {
+    pto2_orchestrator_done(&orch_);
+    pto2_orchestrator_done(&orch_);
+
+    // Flag stays 1 -- store is release-set, not increment.
+    EXPECT_EQ(sm_->header->orchestrator_done.load(std::memory_order_acquire), 1);
+}
diff --git a/tests/ut/cpp/pto2_a2a3/test_runtime_lifecycle.cpp b/tests/ut/cpp/pto2_a2a3/test_runtime_lifecycle.cpp
new file mode 100644
index 000000000..6084c2d25
--- /dev/null
+++ b/tests/ut/cpp/pto2_a2a3/test_runtime_lifecycle.cpp
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO2 Runtime lifecycle UT.
+ *
+ * Covers pto2_runtime_create / _custom / _from_sm / _destroy / set_mode.
+ *
+ * Follows AAA and FIRST: no shared mutable state between tests, each test
+ * constructs its own runtime and tears it down.
+ */
+
+#include <gtest/gtest.h>
+
+#include <cstdlib>
+
+#include "pto_runtime2.h"
+#include "pto_shared_memory.h"
+
+namespace {
+
+constexpr uint64_t kSmallWindow = 64;
+constexpr uint64_t kSmallHeap = 64 * 1024;
+
+// -----------------------------------------------------------------------------
+// Fixture: each test gets a fresh, isolated runtime config.
+// -----------------------------------------------------------------------------
+class RuntimeLifecycleTest : public ::testing::Test {
+protected:
+    PTO2Runtime *rt_ = nullptr;
+
+    void TearDown() override {
+        if (rt_ != nullptr) {
+            pto2_runtime_destroy(rt_);
+            rt_ = nullptr;
+        }
+    }
+};
+
+} // namespace
+
+// ---------- Happy-path creation ----------
+
+TEST_F(RuntimeLifecycleTest, CreateCustom_ValidSizes_ReturnsInitializedRuntime) {
+    // Arrange + Act
+    rt_ = pto2_runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+
+    // Assert
+    ASSERT_NE(rt_, nullptr);
+    EXPECT_NE(rt_->ops, nullptr);
+    EXPECT_NE(rt_->sm_handle, nullptr);
+    EXPECT_NE(rt_->gm_heap, nullptr);
+    EXPECT_TRUE(rt_->gm_heap_owned);
+    EXPECT_EQ(rt_->mode, PTO2_MODE_SIMULATE);
+    EXPECT_EQ(rt_->gm_heap_size, kSmallHeap * PTO2_MAX_RING_DEPTH);
+}
+
+TEST_F(RuntimeLifecycleTest, CreateCustom_ConnectsOrchestratorToScheduler) {
+    rt_ = pto2_runtime_create_custom(PTO2_MODE_EXECUTE, kSmallWindow, kSmallHeap);
+
+    ASSERT_NE(rt_, nullptr);
+    // The orchestrator must be wired to the runtime's own scheduler.
+    EXPECT_EQ(rt_->orchestrator.scheduler, &rt_->scheduler);
+}
+
+TEST_F(RuntimeLifecycleTest, CreateDefault_UsesDefaultSizes) {
+    // create() is a thin wrapper around create_custom with PTO2_TASK_WINDOW_SIZE / PTO2_HEAP_SIZE.
+    // Use GRAPH_ONLY to avoid executor threads. We don't allocate the full
+    // 256MB heap in this path -- keep the assertion restricted to mode.
+    rt_ = pto2_runtime_create(PTO2_MODE_GRAPH_ONLY);
+    ASSERT_NE(rt_, nullptr);
+    EXPECT_EQ(rt_->mode, PTO2_MODE_GRAPH_ONLY);
+}
+
+// ---------- From-SM creation ----------
+
+TEST_F(RuntimeLifecycleTest, CreateFromSM_NullHandle_ReturnsNull) {
+    // Act
+    PTO2Runtime *rt = pto2_runtime_create_from_sm(PTO2_MODE_SIMULATE, nullptr, nullptr, 0);
+
+    // Assert
+    EXPECT_EQ(rt, nullptr);
+}
+
+TEST_F(RuntimeLifecycleTest, CreateFromSM_RecordsCallerBuffers) {
+    // Arrange: caller-allocated sm + gm_heap.
+    PTO2SharedMemoryHandle *sm = pto2_sm_create(kSmallWindow, kSmallHeap);
+    ASSERT_NE(sm, nullptr);
+    uint8_t *heap = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, kSmallHeap));
+    ASSERT_NE(heap, nullptr);
+
+    // Act
+    rt_ = pto2_runtime_create_from_sm(PTO2_MODE_EXECUTE, sm, heap, kSmallHeap);
+
+    // Assert: the returned runtime must NOT claim ownership of the gm_heap.
+    ASSERT_NE(rt_, nullptr);
+    EXPECT_EQ(rt_->sm_handle, sm);
+    EXPECT_EQ(rt_->gm_heap, heap);
+    EXPECT_FALSE(rt_->gm_heap_owned);
+
+    // Cleanup: pto2_runtime_destroy consumes sm via pto2_sm_destroy (observed
+    // behavior, see pto_runtime2.cpp:339), so only free the gm_heap here.
+    pto2_runtime_destroy(rt_);
+    rt_ = nullptr;
+    std::free(heap);
+}
+
+// ---------- Destroy ----------
+
+TEST_F(RuntimeLifecycleTest, Destroy_NullRuntime_NoCrash) {
+    // Documented contract: destroy(nullptr) is a no-op.
+    pto2_runtime_destroy(nullptr);
+    SUCCEED();
+}
+
+TEST_F(RuntimeLifecycleTest, Destroy_ReleasesOwnedHeap) {
+    rt_ = pto2_runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+    ASSERT_NE(rt_, nullptr);
+    // Act: explicitly destroy and null out so TearDown doesn't double-free.
+    pto2_runtime_destroy(rt_);
+    rt_ = nullptr;
+    // Assert: reaching here without asan/ubsan complaint is the test (leak-free).
+    SUCCEED();
+}
+
+// ---------- set_mode ----------
+
+TEST_F(RuntimeLifecycleTest, SetMode_UpdatesField) {
+    rt_ = pto2_runtime_create_custom(PTO2_MODE_EXECUTE, kSmallWindow, kSmallHeap);
+    ASSERT_NE(rt_, nullptr);
+    ASSERT_EQ(rt_->mode, PTO2_MODE_EXECUTE);
+
+    // Act
+    pto2_runtime_set_mode(rt_, PTO2_MODE_GRAPH_ONLY);
+
+    // Assert
+    EXPECT_EQ(rt_->mode, PTO2_MODE_GRAPH_ONLY);
+}
+
+TEST_F(RuntimeLifecycleTest, SetMode_NullRuntime_NoCrash) {
+    // Contract: defensive null check, mirrors destroy.
+    pto2_runtime_set_mode(nullptr, PTO2_MODE_SIMULATE);
+    SUCCEED();
+}
+
+// ---------- Ops table wiring ----------
+
+TEST_F(RuntimeLifecycleTest, OpsTable_AllFunctionPointersPopulated) {
+    rt_ = pto2_runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+    ASSERT_NE(rt_, nullptr);
+    const PTO2RuntimeOps *ops = rt_->ops;
+    ASSERT_NE(ops, nullptr);
+
+    // Hot-path ops called by the orchestration .so -- must never be null.
+    EXPECT_NE(ops->submit_task, nullptr);
+    EXPECT_NE(ops->alloc_tensors, nullptr);
+    EXPECT_NE(ops->scope_begin, nullptr);
+    EXPECT_NE(ops->scope_end, nullptr);
+    EXPECT_NE(ops->orchestration_done, nullptr);
+    EXPECT_NE(ops->is_fatal, nullptr);
+    EXPECT_NE(ops->report_fatal, nullptr);
+    EXPECT_NE(ops->get_tensor_data, nullptr);
+    EXPECT_NE(ops->set_tensor_data, nullptr);
+}
+
+TEST_F(RuntimeLifecycleTest, IsFatal_FreshRuntime_ReturnsFalse) {
+    rt_ = pto2_runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+    ASSERT_NE(rt_, nullptr);
+    EXPECT_FALSE(rt_->ops->is_fatal(rt_));
+}
+
+TEST_F(RuntimeLifecycleTest, ReportFatal_SetsFatalFlag) {
+    rt_ = pto2_runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+    ASSERT_NE(rt_, nullptr);
+
+    // Act
+    rt_->ops->report_fatal(rt_, PTO2_ERROR_EXPLICIT_ORCH_FATAL, "UT", "%s", "forced");
+
+    // Assert
+    EXPECT_TRUE(rt_->ops->is_fatal(rt_));
+}
diff --git a/tests/ut/cpp/pto2_a2a3/test_tensormap_edge.cpp b/tests/ut/cpp/pto2_a2a3/test_tensormap_edge.cpp
new file mode 100644
index 000000000..db409ac57
--- /dev/null
+++ b/tests/ut/cpp/pto2_a2a3/test_tensormap_edge.cpp
@@ -0,0 +1,632 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Edge-case tests for TensorMap and Tensor overlap detection. + * + * ============================================================================ + * ANALYSIS FINDINGS -- check_overlap() in PTO2TensorMapEntry + * ============================================================================ + * + * BUG-CANDIDATE-1 (Overlap fast path): check_overlap() loops for + * entry->ndims, reading input.shapes[i] for all i < entry->ndims. + * When input has fewer dimensions, shapes[i] beyond input->ndims are + * stale (Tensor::init only copies ndims elements). The result is + * non-deterministic -- depends on whatever value happens to be in memory. + * The test poisons input.shapes[1] to make the stale read deterministic + * and proves the loop bound is wrong. + * + * BUG-CANDIDATE-2 (Overlap slow path): The slow path constructs Segment from + * offsets and shapes. But it uses `uint64_t in_off = input.offsets[i]` when + * `input.is_all_offset_zero` is false. If ndims < RUNTIME_MAX_TENSOR_DIMS, + * offsets[ndims..4] may be uninitialized garbage. The loop runs for + * entry->ndims iterations, which could exceed input->ndims. + * -> Actually the loop runs for `ndims` which is the ENTRY's ndims. + * If entry->ndims > input->ndims, input->shapes[i] beyond input->ndims is 0. + * Segment{in_off, in_off + 0} has length 0 -> intersection is always false + * -> returns NO_OVERLAP. This might be wrong if the extra dimensions + * are broadcast or don't exist. + * + * BUG-CANDIDATE-3 (Dimension mismatch): check_overlap uses entry->ndims + * exclusively, ignoring input->ndims. If input has MORE dimensions than + * entry, the extra input dimensions are never checked. This could miss + * partial overlaps in higher dimensions. + * + * BUG-CANDIDATE-4 (Lookup result saturation): PTO2_LOOKUP_MAX_RESULTS = 16. + * If more than 16 overlapping entries exist, results are silently dropped. + * This means dependencies can be missed in highly-connected graphs. + * + * BUG-CANDIDATE-5 (TensorMap new_entry pool exhaustion): new_entry() calls + * `always_assert(next_entry_idx < pool_size)` which throws/aborts when the + * pool is fully used AND free_list is empty. There's no graceful fallback. + * + * BUG-CANDIDATE-6 (Hash collision with cleanup): DISMISSED. + * cleanup_retired() uses debug_assert to verify entry belongs to the + * retiring task. In theory, if the cleanup range exceeds task_window_size, + * slot reuse causes ABA. However, sync_tensormap()'s overlap check + * (pto_tensormap.cpp:244) triggers cleanup every time the current task's + * slot collides with last_cleanup, bounding the cleanup range to at most + * task_window_size. This guarantees each slot maps to exactly one task + * in any cleanup pass. The scenario is unreachable in production. 
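+ *
+ * Overlap predicate assumed throughout this analysis (sketch inferred from the
+ * EDGE-3 arithmetic and the AdjacentNoOverlap test below, not lifted from the
+ * runtime source):
+ *
+ *     // half-open segments [begin, end)
+ *     struct Segment { uint64_t begin, end; };
+ *     bool line_segment_intersection(Segment a, Segment b) {
+ *         return (a.end > b.begin) && (b.end > a.begin);
+ *     }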
+ *
+ * BUG-CANDIDATE-7 (copy_from_tensor doesn't zero beyond ndims): When
+ * copying shapes[]/offsets[] from Tensor to Entry, only ndims elements
+ * are copied. shapes[ndims..4] retain whatever was in the entry before
+ * (from pool reuse). check_overlap loops for entry->ndims, so garbage
+ * data beyond ndims could affect overlap detection if the loop ever
+ * reads beyond what was copied. Currently safe because the loop uses
+ * entry->ndims, which matches what was copied, but fragile.
+ *
+ * ============================================================================
+ * ANALYSIS FINDINGS -- Tensor struct
+ * ============================================================================
+ *
+ * EDGE-1: Tensor with 0 dimensions (ndims=0). No shapes/offsets.
+ * check_overlap loop doesn't execute -> returns COVERED (fast path, contains=true).
+ * Two 0-dim tensors at same addr are always "covered".
+ *
+ * EDGE-2: Tensor with maximum dimensions (ndims=5).
+ * All shape/offset arrays fully used.
+ *
+ * EDGE-3: Shape of 0 in one dimension. Segment = {off, off+0} = empty.
+ * line_segment_intersection({off, off+0}, {x,y}) = (off+0 > x) && (y > off)
+ * = (off > x) && (y > off). Empty segment may or may not intersect.
+ *
+ * EDGE-4: Cleanup ABA -- DISMISSED. sync_tensormap()'s overlap check
+ * bounds cleanup range to at most task_window_size, so a single slot never
+ * maps to two different tasks within one cleanup_retired() call.
+ */
+
+#include <gtest/gtest.h>
+#include <cstdint>
+#include <set>
+#include "common.h"
+#include "pto_tensormap.h"
+#include "pto_orchestration_api.h"
+
+// =============================================================================
+// Helpers
+// =============================================================================
+
+static Tensor
+make_tensor_nd(uint64_t addr, uint32_t ndims, const uint32_t shapes[], const uint32_t offsets[], int32_t version = 0) {
+    // Seed with make_tensor_external() (Tensor's default ctor is private).
+    // Use a dummy 1-dim shape for the seed; we overwrite everything via init().
+    uint32_t seed_shape[1] = {1};
+    Tensor t = make_tensor_external(
+        reinterpret_cast<void *>(addr), seed_shape, 1, DataType::FLOAT32, /*manual_dep=*/false, /*version=*/0
+    );
+    uint32_t s[RUNTIME_MAX_TENSOR_DIMS]{};
+    uint32_t rs[RUNTIME_MAX_TENSOR_DIMS]{};
+    uint32_t o[RUNTIME_MAX_TENSOR_DIMS]{};
+    bool all_zero = true;
+    for (uint32_t i = 0; i < ndims && i < RUNTIME_MAX_TENSOR_DIMS; i++) {
+        s[i] = shapes[i];
+        rs[i] = shapes[i];
+        o[i] = offsets ? offsets[i] : 0;
+        if (o[i] != 0) all_zero = false;
+    }
+    uint64_t total = 4;
+    for (uint32_t i = 0; i < ndims; i++)
+        total *= (rs[i] + (offsets ? offsets[i] : 0));
+    t.init((void *)addr, total, rs, s, o, ndims, DataType::FLOAT32, version, all_zero, /*is_raw_eq_shapes=*/true);
+    return t;
+}
+
+class TensorMapEdgeTest : public ::testing::Test {
+protected:
+    PTO2TensorMap tmap{};
+    int32_t window_sizes[PTO2_MAX_RING_DEPTH]{};
+
+    void SetUp() override {
+        for (int i = 0; i < PTO2_MAX_RING_DEPTH; i++)
+            window_sizes[i] = 64;
+        ASSERT_TRUE(tmap.init(256, 512, window_sizes));
+    }
+    void TearDown() override { tmap.destroy(); }
+};
+
+
+// ---------------------------------------------------------------------------
+// EDGE-1: Zero dimensions (ndims=0)
+// ---------------------------------------------------------------------------
+TEST_F(TensorMapEdgeTest, ZeroDimensionTensor) {
+    uint32_t seed_shape[1] = {1};
+    Tensor t = make_tensor_external(reinterpret_cast<void *>(0x2000), seed_shape, 1, DataType::FLOAT32, false, 0);
+    uint32_t s[5]{}, o[5]{};
+    t.init((void *)0x2000, 0, s, s, o, 0, DataType::FLOAT32, 0, true, true);
+
+    PTO2TaskId task = PTO2TaskId::make(0, 0);
+    tmap.insert(t, task);
+
+    PTO2LookupResult result;
+    result.count = 0;
+    tmap.lookup(t, result);
+
+    EXPECT_GE(result.count, 1);
+    if (result.count > 0) {
+        // ndims=0: fast-path loop doesn't execute, contains=true -> COVERED
+        EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED);
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Zero dimensions: Two different 0-dim tensors at same address always COVERED
+// This is semantically questionable -- should scalar tensors be independent?
+// ---------------------------------------------------------------------------
+TEST_F(TensorMapEdgeTest, TwoZeroDimTensorsSameAddr) {
+    uint32_t seed_shape[1] = {1};
+    Tensor t1 = make_tensor_external(reinterpret_cast<void *>(0x2100), seed_shape, 1, DataType::FLOAT32, false, 0);
+    Tensor t2 = make_tensor_external(reinterpret_cast<void *>(0x2100), seed_shape, 1, DataType::FLOAT32, false, 0);
+    uint32_t s[5]{}, o[5]{};
+    t1.init((void *)0x2100, 0, s, s, o, 0, DataType::FLOAT32, 0, true, true);
+    t2.init((void *)0x2100, 0, s, s, o, 0, DataType::FLOAT32, 0, true, true);
+
+    tmap.insert(t1, PTO2TaskId::make(0, 0));
+    tmap.insert(t2, PTO2TaskId::make(0, 1));
+
+    PTO2LookupResult result;
+    result.count = 0;
+    tmap.lookup(t1, result);
+
+    // Both 0-dim entries report COVERED for any 0-dim input at same addr
+    EXPECT_EQ(result.count, 2);
+    for (int i = 0; i < result.count; i++) {
+        EXPECT_EQ(result.entries[i].overlap_status, OverlapStatus::COVERED)
+            << "0-dim tensors always report COVERED (empty loop -> contains=true)";
+    }
+}
+
+// ---------------------------------------------------------------------------
+// BUG-CANDIDATE-4: Lookup result saturation (>16 producers)
+// ---------------------------------------------------------------------------
+TEST_F(TensorMapEdgeTest, LookupResultSaturation) {
+    uint32_t shapes[] = {100};
+    Tensor t = make_tensor_nd(0x4000, 1, shapes, nullptr, 0);
+
+    // Insert 20 producers for the same tensor
+    for (int i = 0; i < 20; i++) {
+        tmap.insert(t, PTO2TaskId::make(0, i));
+    }
+
+    PTO2LookupResult result;
+    result.count = 0;
+    tmap.lookup(t, result);
+
+    // Only 16 results fit -- 4 dependencies are silently dropped
+    EXPECT_EQ(result.count, PTO2_LOOKUP_MAX_RESULTS)
+        << "More than 16 overlapping producers: results saturated, deps missed";
+}
+
+// ---------------------------------------------------------------------------
+// BUG-CANDIDATE-4 extended: Saturation drops OLDEST producers (newest first)
+// Because insert() adds at head of
bucket chain, lookup traverses newest first. +// The first 16 (newest) entries fill the result, dropping the 4 oldest. +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, LookupSaturationDropsOldest) { + uint32_t shapes[] = {100}; + Tensor t = make_tensor_nd(0x4100, 1, shapes, nullptr, 0); + + for (int i = 0; i < 20; i++) { + tmap.insert(t, PTO2TaskId::make(0, i)); + } + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(t, result); + + ASSERT_EQ(result.count, PTO2_LOOKUP_MAX_RESULTS); + + // Verify the kept results are the newest 16 (tasks 19, 18, ..., 4) + // and the oldest 4 (tasks 0, 1, 2, 3) are dropped + for (int i = 0; i < result.count; i++) { + int32_t local_id = result.entries[i].entry->producer_task_id.local(); + // The newest entries are inserted at head, so lookup sees them first + EXPECT_GE(local_id, 4) << "Oldest tasks (0-3) should be the ones dropped by saturation"; + } +} + +// --------------------------------------------------------------------------- +// Version-based overlap: newer version returns OTHER +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, VersionMismatchReturnsOther) { + uint32_t shapes[] = {100}; + Tensor v0 = make_tensor_nd(0x5000, 1, shapes, nullptr, 0); + Tensor v1 = make_tensor_nd(0x5000, 1, shapes, nullptr, 1); + + tmap.insert(v0, PTO2TaskId::make(0, 0)); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(v1, result); + + EXPECT_EQ(result.count, 1); + // Version 1 > Version 0 -> OTHER (not COVERED) + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER); +} + +// --------------------------------------------------------------------------- +// Version: Same version, same shapes -> COVERED +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, SameVersionSameShapesCovered) { + uint32_t shapes[] = {100}; + Tensor t = make_tensor_nd(0x5100, 1, shapes, nullptr, 0); + + tmap.insert(t, PTO2TaskId::make(0, 0)); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(t, result); + + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED) << "Same version + same shapes -> COVERED"; +} + +// --------------------------------------------------------------------------- +// Partial overlap 1D: [0:100] vs [50:150] +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, PartialOverlap1D) { + uint32_t prod_shapes[] = {100}; + Tensor prod = make_tensor_nd(0x6000, 1, prod_shapes, nullptr, 0); + tmap.insert(prod, PTO2TaskId::make(0, 0)); + + // Consumer reads [50:150] -- partial overlap + uint32_t cons_shapes[] = {100}; + uint32_t cons_offsets[] = {50}; + Tensor cons = make_tensor_nd(0x6000, 1, cons_shapes, cons_offsets, 0); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(cons, result); + + EXPECT_EQ(result.count, 1); + // Consumer [50,150) vs Producer [0,100) -> intersection = [50,100). 
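+    // (Assumed predicate arithmetic: 150 > 0 && 100 > 50, so the segments intersect.)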
+ // Consumer does NOT contain producer -> OTHER + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER); +} + +// --------------------------------------------------------------------------- +// Consumer fully covers producer: COVERED +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, ConsumerCoversProducer) { + // Producer writes [10:20] + uint32_t prod_shapes[] = {10}; + uint32_t prod_offsets[] = {10}; + Tensor prod = make_tensor_nd(0x7000, 1, prod_shapes, prod_offsets, 0); + tmap.insert(prod, PTO2TaskId::make(0, 0)); + + // Consumer reads [0:100] -- fully covers producer + uint32_t cons_shapes[] = {100}; + Tensor cons = make_tensor_nd(0x7000, 1, cons_shapes, nullptr, 0); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(cons, result); + + EXPECT_EQ(result.count, 1); + // Consumer [0,100) contains Producer [10,20) -> COVERED + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED); +} + +// --------------------------------------------------------------------------- +// Adjacent regions: [0:100] vs [100:200] -> NO_OVERLAP +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, AdjacentNoOverlap) { + uint32_t prod_shapes[] = {100}; + Tensor prod = make_tensor_nd(0x8000, 1, prod_shapes, nullptr, 0); + tmap.insert(prod, PTO2TaskId::make(0, 0)); + + uint32_t cons_shapes[] = {100}; + uint32_t cons_offsets[] = {100}; + Tensor cons = make_tensor_nd(0x8000, 1, cons_shapes, cons_offsets, 0); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(cons, result); + + // [0,100) vs [100,200) -> end(100) > begin(100)? No -> NO_OVERLAP + EXPECT_EQ(result.count, 0); +} + +// --------------------------------------------------------------------------- +// One-element overlap: [0:100] vs [99:199] +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, OneElementOverlap) { + uint32_t prod_shapes[] = {100}; + Tensor prod = make_tensor_nd(0x8100, 1, prod_shapes, nullptr, 0); + tmap.insert(prod, PTO2TaskId::make(0, 0)); + + uint32_t cons_shapes[] = {100}; + uint32_t cons_offsets[] = {99}; + Tensor cons = make_tensor_nd(0x8100, 1, cons_shapes, cons_offsets, 0); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(cons, result); + + // [0,100) vs [99,199) -> intersection = [99,100) = 1 element + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER) << "Partial overlap (1 element) -> OTHER"; +} + +// --------------------------------------------------------------------------- +// EDGE-3: Shape of 0 in one dimension (empty segment behavior) +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, ZeroShapeInDimension) { + // Producer: 2D [10, 0] -- zero in dim 1 + uint32_t prod_shapes[] = {10, 0}; + Tensor prod = make_tensor_nd(0x8200, 2, prod_shapes, nullptr, 0); + tmap.insert(prod, PTO2TaskId::make(0, 0)); + + // Consumer: 2D [10, 20] + uint32_t cons_shapes[] = {10, 20}; + Tensor cons = make_tensor_nd(0x8200, 2, cons_shapes, nullptr, 0); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(cons, result); + + if (result.count > 0) { + // Fast path: input.shapes[1](20) < entry.shapes[1](0)? No, 20 >= 0. + // -> contains = true -> COVERED. + // But the producer wrote ZERO elements in dim 1! + // Should a zero-area producer be "covered" by any consumer? + // This is semantically questionable. 
+ EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED) + << "Zero-shape producer is COVERED by any consumer (empty production)"; + } +} + +// --------------------------------------------------------------------------- +// 2D overlap: different slices +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, MultiDimOverlap) { + // Producer: 2D [10, 20] at offset [0, 0] + uint32_t prod_shapes[] = {10, 20}; + Tensor prod = make_tensor_nd(0x9000, 2, prod_shapes, nullptr, 0); + tmap.insert(prod, PTO2TaskId::make(0, 0)); + + // Consumer: 2D [5, 10] at offset [2, 5] -- overlaps partially + uint32_t cons_shapes[] = {5, 10}; + uint32_t cons_offsets[] = {2, 5}; + Tensor cons = make_tensor_nd(0x9000, 2, cons_shapes, cons_offsets, 0); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(cons, result); + + EXPECT_EQ(result.count, 1); + // Consumer [2,7)x[5,15) vs Producer [0,10)x[0,20) + // check_overlap checks if INPUT(consumer) contains ENTRY(producer): + // Dim 0: consumer [2,7) does NOT contain producer [0,10) -> contains=false + // Dim 1: consumer [5,15) does NOT contain producer [0,20) -> contains=false + // All dims intersect, but consumer doesn't fully cover -> OTHER + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER) + << "Consumer sub-region inside producer: overlap exists but not COVERED"; +} + +// --------------------------------------------------------------------------- +// 2D: Consumer exceeds producer in one dimension -> OTHER +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, MultiDimPartialOverlap) { + uint32_t prod_shapes[] = {10, 20}; + Tensor prod = make_tensor_nd(0x9100, 2, prod_shapes, nullptr, 0); + tmap.insert(prod, PTO2TaskId::make(0, 0)); + + // Consumer: [8, 25] -- exceeds producer in dim 1 (25 > 20) + uint32_t cons_shapes[] = {8, 25}; + Tensor cons = make_tensor_nd(0x9100, 2, cons_shapes, nullptr, 0); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(cons, result); + + EXPECT_EQ(result.count, 1); + // Fast path: shapes comparison + // input.shapes[0]=8 >= entry.shapes[0]=10? 
No -> contains=false -> OTHER
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER);
+}
+
+// ---------------------------------------------------------------------------
+// 5D full overlap test (maximum dimensions)
+// ---------------------------------------------------------------------------
+TEST_F(TensorMapEdgeTest, FullFiveDimensionalOverlap) {
+    uint32_t prod_shapes[] = {2, 3, 4, 5, 6};
+    Tensor prod = make_tensor_nd(0x9200, 5, prod_shapes, nullptr, 0);
+    tmap.insert(prod, PTO2TaskId::make(0, 0));
+
+    // Consumer with larger shapes in all dims -> COVERED
+    uint32_t cons_shapes[] = {4, 6, 8, 10, 12};
+    Tensor cons = make_tensor_nd(0x9200, 5, cons_shapes, nullptr, 0);
+
+    PTO2LookupResult result;
+    result.count = 0;
+    tmap.lookup(cons, result);
+
+    EXPECT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED)
+        << "5D consumer covers 5D producer in all dimensions";
+}
+
+// ---------------------------------------------------------------------------
+// Cleanup then insert: verify chain integrity
+// ---------------------------------------------------------------------------
+TEST_F(TensorMapEdgeTest, CleanupThenReuseSlot) {
+    uint32_t shapes[] = {100};
+    Tensor t = make_tensor_nd(0xA000, 1, shapes, nullptr, 0);
+
+    // Insert entries for tasks 0-7
+    for (int i = 0; i < 8; i++) {
+        tmap.insert(t, PTO2TaskId::make(0, i));
+    }
+    EXPECT_EQ(tmap.valid_count(), 8);
+
+    // Cleanup tasks 0-4
+    tmap.cleanup_retired(0, 0, 5);
+    tmap.sync_validity(0, 5);
+    EXPECT_EQ(tmap.valid_count(), 3); // tasks 5,6,7 remain
+
+    // Re-insert with new task IDs that reuse slots 0-4
+    // (task window = 64, so IDs 64-68 map to slots 0-4)
+    for (int i = 64; i < 69; i++) {
+        tmap.insert(t, PTO2TaskId::make(0, i));
+    }
+
+    PTO2LookupResult result;
+    result.count = 0;
+    tmap.lookup(t, result);
+
+    // Should find 8 entries: 3 old (5,6,7) + 5 new (64-68)
+    EXPECT_EQ(result.count, 8);
+}
+
+// ---------------------------------------------------------------------------
+// Hash distribution: addresses that are multiples of common alignment
+// ---------------------------------------------------------------------------
+TEST_F(TensorMapEdgeTest, HashDistributionAlignedAddresses) {
+    // Typical device addresses are 256-byte or 1024-byte aligned.
+    // The hash function should distribute these well.
+    std::set<uint32_t> buckets_used;
+    for (int i = 0; i < 100; i++) {
+        uint64_t addr = 0x10000 + i * 1024;
+        uint32_t bucket = tmap.hash(addr);
+        buckets_used.insert(bucket);
+    }
+    // With 256 buckets and 100 addresses, we should use many distinct buckets
+    // (poor hash would cluster aligned addresses into few buckets)
+    EXPECT_GT(buckets_used.size(), 50u) << "Hash should distribute 1024-aligned addresses across many buckets";
+}
+
+// ---------------------------------------------------------------------------
+// Lookup on empty TensorMap
+// ---------------------------------------------------------------------------
+TEST_F(TensorMapEdgeTest, LookupEmpty) {
+    uint32_t shapes[] = {100};
+    Tensor t = make_tensor_nd(0xC000, 1, shapes, nullptr, 0);
+
+    PTO2LookupResult result;
+    result.count = 0;
+    tmap.lookup(t, result);
+
+    EXPECT_EQ(result.count, 0) << "Empty TensorMap returns no results";
+}
+
+// ---------------------------------------------------------------------------
+// Lazy invalidation: entries become stale when last_task_alive advances
+// ---------------------------------------------------------------------------
+TEST_F(TensorMapEdgeTest, LazyInvalidation) {
+    uint32_t shapes[] =
{100}; + Tensor t = make_tensor_nd(0xD000, 1, shapes, nullptr, 0); + + // Insert entries for tasks 0-4 + for (int i = 0; i < 5; i++) { + tmap.insert(t, PTO2TaskId::make(0, i)); + } + + // All 5 should be found + PTO2LookupResult result; + result.count = 0; + tmap.lookup(t, result); + EXPECT_EQ(result.count, 5); + + // Advance validity threshold: tasks 0-2 become stale + tmap.sync_validity(0, 3); + + result.count = 0; + tmap.lookup(t, result); + EXPECT_EQ(result.count, 2) << "Only tasks 3,4 are valid after sync_validity(3)"; +} + +// --------------------------------------------------------------------------- +// entry_valid with different rings: ring isolation +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, RingIsolation) { + uint32_t shapes[] = {100}; + Tensor t = make_tensor_nd(0xE000, 1, shapes, nullptr, 0); + + // Insert in ring 0 (task 0) and ring 1 (task 0) + tmap.insert(t, PTO2TaskId::make(0, 0)); + tmap.insert(t, PTO2TaskId::make(1, 0)); + + // Invalidate ring 0's tasks but not ring 1's + tmap.sync_validity(0, 1); + + PTO2LookupResult result; + result.count = 0; + tmap.lookup(t, result); + + // Only ring 1's entry should remain valid + EXPECT_EQ(result.count, 1); + if (result.count == 1) { + EXPECT_EQ(result.entries[0].entry->producer_task_id.ring(), 1) + << "Ring 0's entry is invalidated; ring 1's entry survives"; + } +} + +// --------------------------------------------------------------------------- +// Multiple tensors at different addresses: no cross-contamination +// --------------------------------------------------------------------------- +TEST_F(TensorMapEdgeTest, DifferentAddressesIsolated) { + uint32_t shapes[] = {100}; + Tensor t1 = make_tensor_nd(0xF000, 1, shapes, nullptr, 0); + Tensor t2 = make_tensor_nd(0xF100, 1, shapes, nullptr, 0); + + tmap.insert(t1, PTO2TaskId::make(0, 0)); + tmap.insert(t2, PTO2TaskId::make(0, 1)); + + PTO2LookupResult result1; + result1.count = 0; + tmap.lookup(t1, result1); + EXPECT_EQ(result1.count, 1); + + PTO2LookupResult result2; + result2.count = 0; + tmap.lookup(t2, result2); + EXPECT_EQ(result2.count, 1); + + // Each lookup only finds its own producer + if (result1.count == 1 && result2.count == 1) { + EXPECT_NE( + result1.entries[0].entry->producer_task_id.local(), result2.entries[0].entry->producer_task_id.local() + ); + } +} + +// --------------------------------------------------------------------------- +// Free list recycling: after cleanup, new inserts reuse freed entries +// without exhausting the pool. Verified via observable behavior (pool +// doesn't exhaust) rather than internal pool index inspection. 
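+// (Slot math, assuming the local_id % window_size mapping implied by CleanupThenReuseSlot:
+// re-inserted ids 64..119 wrap onto slots 0..55, whose previous entries -- tasks 0..55 --
+// were retired by cleanup_retired(0, 0, 60).)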
+// ---------------------------------------------------------------------------
+TEST_F(TensorMapEdgeTest, FreeListRecycling) {
+    uint32_t shapes[] = {100};
+    Tensor t = make_tensor_nd(0x10000, 1, shapes, nullptr, 0);
+
+    // Insert 60 entries (within window_size=64, no slot collision)
+    for (int i = 0; i < 60; i++) {
+        tmap.insert(t, PTO2TaskId::make(0, i));
+    }
+
+    // Cleanup all 60 (range 0..60 < window_size=64, no ABA)
+    tmap.cleanup_retired(0, 0, 60);
+    tmap.sync_validity(0, 60);
+
+    // Insert another 60 -- should succeed because freed entries are reused
+    for (int i = 60; i < 120; i++) {
+        tmap.insert(t, PTO2TaskId::make(0, i));
+    }
+
+    // Verify via lookup: all 60 new entries should be findable
+    PTO2LookupResult result;
+    result.count = 0;
+    tmap.lookup(t, result);
+    // Capped at PTO2_LOOKUP_MAX_RESULTS=16, but count > 0 proves entries exist
+    EXPECT_EQ(result.count, PTO2_LOOKUP_MAX_RESULTS) << "After cleanup+reinsert, new entries are findable -- "
+                                                        "free list recycling keeps the pool from exhausting";
+}
diff --git a/tests/ut/cpp/test_a5_pto2_fatal.cpp b/tests/ut/cpp/pto2_a5/test_a5_pto2_fatal.cpp
similarity index 90%
rename from tests/ut/cpp/test_a5_pto2_fatal.cpp
rename to tests/ut/cpp/pto2_a5/test_a5_pto2_fatal.cpp
index 83d9483b1..2346d1911 100644
--- a/tests/ut/cpp/test_a5_pto2_fatal.cpp
+++ b/tests/ut/cpp/pto2_a5/test_a5_pto2_fatal.cpp
@@ -41,6 +41,8 @@ struct FakeRuntime {
     std::string last_fatal_message;
 };
 
+static_assert(offsetof(FakeRuntime, ops) == 0); // Guard: reinterpret_cast below assumes ops is first member.
+
 FakeRuntime *as_fake(PTO2Runtime *rt) { return reinterpret_cast<FakeRuntime *>(rt); }
 
 TaskOutputTensors fake_submit(PTO2Runtime *rt, const MixedKernels &, const Arg &) {
@@ -87,20 +89,20 @@ TaskOutputTensors fake_alloc_tensors(PTO2Runtime *rt, const Arg &) {
 }
 
 const PTO2RuntimeOps kFakeOps = {
-    fake_submit,
-    fake_scope_begin,
-    fake_scope_end,
-    fake_orchestration_done,
-    fake_is_fatal,
-    fake_report_fatal,
-    fake_log,
-    fake_log,
-    fake_log,
-    fake_log,
-    fake_log,
-    fake_get_tensor_data,
-    fake_set_tensor_data,
-    fake_alloc_tensors,
+    .submit_task = fake_submit,
+    .scope_begin = fake_scope_begin,
+    .scope_end = fake_scope_end,
+    .orchestration_done = fake_orchestration_done,
+    .is_fatal = fake_is_fatal,
+    .report_fatal = fake_report_fatal,
+    .log_error = fake_log,
+    .log_warn = fake_log,
+    .log_info = fake_log,
+    .log_debug = fake_log,
+    .log_always = fake_log,
+    .get_tensor_data = fake_get_tensor_data,
+    .set_tensor_data = fake_set_tensor_data,
+    .alloc_tensors = fake_alloc_tensors,
 };
 
 class RuntimeBindingGuard {
diff --git a/tests/ut/cpp/stubs/test_stubs.cpp b/tests/ut/cpp/stubs/test_stubs.cpp
new file mode 100644
index 000000000..b9593ed08
--- /dev/null
+++ b/tests/ut/cpp/stubs/test_stubs.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Link-time stubs for platform APIs used by runtime headers.
+ *
+ * Provides x86-compatible implementations of functions declared in
+ * platform headers (unified_log.h, device_time.h, common.h) so that
+ * runtime data structures can be unit-tested on CI runners without
+ * Ascend hardware or SDK.
+ */
+
+#include <chrono>
+#include <cstdarg>
+#include <cstdint>
+#include <cstdio>
+#include <stdexcept>
+#include <string>
+
+// =============================================================================
+// unified_log.h stubs (5 log-level functions)
+// =============================================================================
+
+extern "C" {
+
+void unified_log_error(const char *func, const char *fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    fprintf(stderr, "[ERROR] %s: ", func);
+    vfprintf(stderr, fmt, args);
+    fprintf(stderr, "\n");
+    va_end(args);
+}
+
+void unified_log_warn(const char *func, const char *fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    fprintf(stderr, "[WARN] %s: ", func);
+    vfprintf(stderr, fmt, args);
+    fprintf(stderr, "\n");
+    va_end(args);
+}
+
+void unified_log_info(const char * /* func */, const char * /* fmt */, ...) {
+    // Suppress info in tests
+}
+
+void unified_log_debug(const char * /* func */, const char * /* fmt */, ...) {
+    // Suppress debug in tests
+}
+
+void unified_log_always(const char *func, const char *fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    fprintf(stderr, "[ALWAYS] %s: ", func);
+    vfprintf(stderr, fmt, args);
+    fprintf(stderr, "\n");
+    va_end(args);
+}
+
+} // extern "C"
+
+// =============================================================================
+// device_time.h stub
+// =============================================================================
+
+uint64_t get_sys_cnt_aicpu() {
+    auto now = std::chrono::steady_clock::now();
+    // Nanosecond resolution assumed; any monotonic tick is good enough for an off-hardware stub.
+    return static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(now.time_since_epoch()).count());
+}
+
+// =============================================================================
+// common.h stubs (assert_impl, get_stacktrace, AssertionError)
+// =============================================================================
+
+std::string get_stacktrace(int /* skip_frames */) { return ""; }
+
+class AssertionError : public std::runtime_error {
+public:
+    AssertionError(const char *condition, const char *file, int line) :
+        std::runtime_error(std::string("Assertion failed: ") + condition + " at " + file + ":" + std::to_string(line)),
+        condition_(condition),
+        file_(file),
+        line_(line) {}
+
+    const char *condition() const { return condition_; }
+    const char *file() const { return file_; }
+    int line() const { return line_; }
+
+private:
+    const char *condition_;
+    const char *file_;
+    int line_;
+};
+
+[[noreturn]] void assert_impl(const char *condition, const char *file, int line) {
+    throw AssertionError(condition, file, line);
+}
diff --git a/tests/ut/cpp/test_helpers.h b/tests/ut/cpp/test_helpers.h
new file mode 100644
index 000000000..a4244c9a2
--- /dev/null
+++ b/tests/ut/cpp/test_helpers.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Shared test helper utilities for C++ unit tests. + * + * Provides convenience functions that initialize internal data structures + * from user-supplied buffers, avoiding direct field manipulation in tests. + */ +#pragma once + +#include "pto_scheduler.h" + +/** + * Initialize a ReadyQueue with a caller-provided slot buffer and start sequence. + * + * Unlike pto2_ready_queue_init() which malloc's its own buffer and starts at 0, + * this helper uses a stack-allocated buffer and supports arbitrary start sequences + * (needed for sequence-wrap tests). + */ +inline void +test_ready_queue_init(PTO2ReadyQueue *queue, PTO2ReadyQueueSlot *slots, uint64_t capacity, int64_t start_seq = 0) { + queue->slots = slots; + queue->capacity = capacity; + queue->mask = capacity - 1; + queue->enqueue_pos.store(start_seq, std::memory_order_relaxed); + queue->dequeue_pos.store(start_seq, std::memory_order_relaxed); + for (uint64_t i = 0; i < capacity; i++) { + int64_t pos = start_seq + (int64_t)i; + uint64_t idx = (uint64_t)pos & (capacity - 1); + slots[idx].sequence.store(pos, std::memory_order_relaxed); + slots[idx].slot_state = nullptr; + } +} diff --git a/tests/ut/cpp/test_child_memory.cpp b/tests/ut/cpp/types/test_child_memory.cpp similarity index 98% rename from tests/ut/cpp/test_child_memory.cpp rename to tests/ut/cpp/types/test_child_memory.cpp index 2ac7073a2..418cfdc7c 100644 --- a/tests/ut/cpp/test_child_memory.cpp +++ b/tests/ut/cpp/types/test_child_memory.cpp @@ -20,6 +20,7 @@ // ContinuousTensor layout // --------------------------------------------------------------------------- +// ABI contract: size must match serialization format. TEST(ChildMemory, SizeofUnchanged) { EXPECT_EQ(sizeof(ContinuousTensor), 40u); } TEST(ChildMemory, DefaultIsZero) {