Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 71 additions & 13 deletions tests/ut/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,49 @@ if(NOT GTEST_LIB OR NOT GTEST_MAIN_LIB)
set(GTEST_INCLUDE_DIRS "") # include dirs are carried by the gtest target
endif()

# ---------------------------------------------------------------------------
# A2A3 runtime sources and stubs for ring-buffer / tensormap tests
# ---------------------------------------------------------------------------
# Single definition of the repository root (this test project sits three
# directory levels below it — presumably tests/ut/cpp; confirm if the tree
# moves). Every A2A3 path is derived from these two variables so a future
# relocation needs exactly one edit instead of six.
set(A2A3_REPO_ROOT ${CMAKE_SOURCE_DIR}/../../..)
set(A2A3_TMRB_DIR ${A2A3_REPO_ROOT}/src/a2a3/runtime/tensormap_and_ringbuffer)

# Directory holding the runtime translation units under test.
set(A2A3_RUNTIME_DIR ${A2A3_TMRB_DIR}/runtime)

# Shared stub sources linked into every A2A3 runtime test executable.
set(A2A3_STUB_SOURCES ${CMAKE_SOURCE_DIR}/stubs/test_stubs.cpp)

# Runtime translation units exercised by the ring-buffer / tensormap tests.
set(A2A3_RUNTIME_SOURCES
    ${A2A3_RUNTIME_DIR}/pto_ring_buffer.cpp
    ${A2A3_RUNTIME_DIR}/pto_shared_memory.cpp
    ${A2A3_RUNTIME_DIR}/pto_scheduler.cpp
    ${A2A3_RUNTIME_DIR}/pto_tensormap.cpp
)

# Include paths common to every A2A3 runtime test target.
set(A2A3_COMMON_INCLUDE_DIRS
    ${A2A3_TMRB_DIR}/orchestration
    ${A2A3_TMRB_DIR}/runtime
    ${A2A3_TMRB_DIR}/common
    ${A2A3_REPO_ROOT}/src/a2a3/platform/include
    ${A2A3_REPO_ROOT}/src/common/task_interface
)

# add_a2a3_runtime_test(<name> SOURCES <src>... [EXTRA_SOURCES <src>...])
#
# Registers a host-only gtest executable (labelled "no_hardware") for the A2A3
# ring-buffer / tensormap runtime.
#   SOURCES        — always compiled into the test binary.
#   EXTRA_SOURCES  — optional files, compiled only when present on disk (lets
#                    a shared test list reference per-variant files that may
#                    not exist in every checkout).
# The shared A2A3 stubs (${A2A3_STUB_SOURCES}) are linked into every test.
function(add_a2a3_runtime_test name)
    cmake_parse_arguments(PARSE_ARGV 1 ARG "" "" "SOURCES;EXTRA_SOURCES")
    if(ARG_UNPARSED_ARGUMENTS)
        message(FATAL_ERROR
            "add_a2a3_runtime_test(${name}): unknown arguments: ${ARG_UNPARSED_ARGUMENTS}")
    endif()
    # Mandatory sources plus shared stubs. Note: only EXTRA_SOURCES go through
    # the EXISTS filter — the previous version also looped over SOURCES here,
    # which listed every mandatory source twice in add_executable().
    set(_all_sources ${ARG_SOURCES} ${A2A3_STUB_SOURCES})
    foreach(src IN LISTS ARG_EXTRA_SOURCES)
        if(EXISTS "${src}")
            list(APPEND _all_sources "${src}")
        endif()
    endforeach()
    add_executable(${name} ${_all_sources})
    target_include_directories(${name} PRIVATE
        ${GTEST_INCLUDE_DIRS}
        ${A2A3_COMMON_INCLUDE_DIRS}
    )
    # Preprocessor define, so use target_compile_definitions rather than
    # smuggling a -D flag through target_compile_options. Must match the ABI
    # the prebuilt gtest libraries were compiled with.
    target_compile_definitions(${name} PRIVATE _GLIBCXX_USE_CXX11_ABI=0)
    # NOTE(review): bare `pthread` mirrors the sibling helpers in this file;
    # Threads::Threads would be preferable once find_package(Threads) is added.
    target_link_libraries(${name} PRIVATE
        ${GTEST_MAIN_LIB}
        ${GTEST_LIB}
        pthread
    )
    add_test(NAME ${name} COMMAND ${name})
    # Label lets the no-hardware CI job select these tests via ctest -L.
    set_tests_properties(${name} PROPERTIES LABELS "no_hardware")
endfunction()

# ---------------------------------------------------------------------------
# Distributed runtime sources under test
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -113,7 +156,7 @@ function(add_hierarchical_test name src)
add_test(NAME ${name} COMMAND ${name})
endfunction()

function(add_a2a3_pto2_test name src)
function(add_a2a3_test name src)
add_executable(${name} ${src})
target_include_directories(${name} PRIVATE
${GTEST_INCLUDE_DIRS}
Expand All @@ -132,7 +175,7 @@ function(add_a2a3_pto2_test name src)
add_test(NAME ${name} COMMAND ${name})
endfunction()

function(add_a5_pto2_test name src)
function(add_a5_test name src)
add_executable(${name} ${src})
target_include_directories(${name} PRIVATE
${GTEST_INCLUDE_DIRS}
Expand All @@ -151,13 +194,6 @@ function(add_a5_pto2_test name src)
add_test(NAME ${name} COMMAND ${name})
endfunction()

enable_testing()

add_hierarchical_test(test_tensormap test_tensormap.cpp)
add_hierarchical_test(test_ring test_ring.cpp)
add_hierarchical_test(test_scope test_scope.cpp)
add_hierarchical_test(test_orchestrator test_orchestrator.cpp)
add_hierarchical_test(test_scheduler test_scheduler.cpp)
function(add_task_interface_test name src)
add_executable(${name} ${src})
target_include_directories(${name} PRIVATE
Expand All @@ -173,9 +209,31 @@ function(add_task_interface_test name src)
add_test(NAME ${name} COMMAND ${name})
endfunction()

add_task_interface_test(test_child_memory test_child_memory.cpp)
add_a2a3_pto2_test(test_a2a3_pto2_fatal test_a2a3_pto2_fatal.cpp)
add_a5_pto2_test(test_a5_pto2_fatal test_a5_pto2_fatal.cpp)
# Enable CTest before any helper is *invoked* (the helpers above only define
# functions; add_test runs when they are called below).
enable_testing()

# ---------------------------------------------------------------------------
# Hierarchical runtime tests (src/common/hierarchical/)
# ---------------------------------------------------------------------------
add_hierarchical_test(test_tensormap hierarchical/test_tensormap.cpp)
add_hierarchical_test(test_ring hierarchical/test_ring.cpp)
add_hierarchical_test(test_scope hierarchical/test_scope.cpp)
add_hierarchical_test(test_orchestrator hierarchical/test_orchestrator.cpp)
add_hierarchical_test(test_scheduler hierarchical/test_scheduler.cpp)

# ---------------------------------------------------------------------------
# Types / task_interface tests (src/common/task_interface/)
# ---------------------------------------------------------------------------
add_task_interface_test(test_child_memory types/test_child_memory.cpp)

# ---------------------------------------------------------------------------
# A2A3 tests (src/a2a3/runtime/tensormap_and_ringbuffer/)
# ---------------------------------------------------------------------------
add_a2a3_test(test_a2a3_fatal a2a3/test_a2a3_fatal.cpp)

# ---------------------------------------------------------------------------
# A5 tests (src/a5/runtime/tensormap_and_ringbuffer/)
# ---------------------------------------------------------------------------
add_a5_test(test_a5_fatal a5/test_a5_fatal.cpp)

# Hardware-gated tests. Block is only entered when the project is configured
# with -DSIMPLER_ENABLE_HARDWARE_TESTS=ON. CI's no-hw `ut` job does not pass
Expand Down Expand Up @@ -238,5 +296,5 @@ if(SIMPLER_ENABLE_HARDWARE_TESTS)
)
endfunction()

add_comm_api_test(test_hccl_comm test_hccl_comm.cpp)
add_comm_api_test(test_hccl_comm hardware/test_hccl_comm.cpp)
endif()
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ struct FakeRuntime {
std::string last_fatal_message;
};

static_assert(offsetof(FakeRuntime, ops) == 0); // Guard: reinterpret_cast below assumes ops is first member.

FakeRuntime *as_fake(PTO2Runtime *rt) { return reinterpret_cast<FakeRuntime *>(rt); }

TaskOutputTensors fake_submit(PTO2Runtime *rt, const MixedKernels &, const Arg &) {
Expand Down Expand Up @@ -87,20 +89,20 @@ TaskOutputTensors fake_alloc_tensors(PTO2Runtime *rt, const Arg &) {
}

const PTO2RuntimeOps kFakeOps = {
fake_submit,
fake_scope_begin,
fake_scope_end,
fake_orchestration_done,
fake_is_fatal,
fake_report_fatal,
fake_log,
fake_log,
fake_log,
fake_log,
fake_log,
fake_get_tensor_data,
fake_set_tensor_data,
fake_alloc_tensors,
.submit_task = fake_submit,
.scope_begin = fake_scope_begin,
.scope_end = fake_scope_end,
.orchestration_done = fake_orchestration_done,
.is_fatal = fake_is_fatal,
.report_fatal = fake_report_fatal,
.log_error = fake_log,
.log_warn = fake_log,
.log_info = fake_log,
.log_debug = fake_log,
.log_always = fake_log,
.get_tensor_data = fake_get_tensor_data,
.set_tensor_data = fake_set_tensor_data,
.alloc_tensors = fake_alloc_tensors,
};

class RuntimeBindingGuard {
Expand All @@ -116,7 +118,7 @@ TensorCreateInfo make_ci() {

} // namespace

TEST(A5PTO2Fatal, ApiShortCircuitsAfterFatal) {
TEST(A2A3Fatal, ApiShortCircuitsAfterFatal) {
FakeRuntime runtime{};
runtime.ops = &kFakeOps;
runtime.fatal = true;
Expand Down Expand Up @@ -148,7 +150,7 @@ TEST(A5PTO2Fatal, ApiShortCircuitsAfterFatal) {
EXPECT_EQ(runtime.report_fatal_calls, 0);
}

TEST(A5PTO2Fatal, ExplicitFatalRoutesThroughOps) {
TEST(A2A3Fatal, ExplicitFatalRoutesThroughOps) {
FakeRuntime runtime{};
runtime.ops = &kFakeOps;
RuntimeBindingGuard bind(reinterpret_cast<PTO2Runtime *>(&runtime));
Expand All @@ -167,7 +169,7 @@ TEST(A5PTO2Fatal, ExplicitFatalRoutesThroughOps) {
EXPECT_EQ(runtime.submit_calls, 0);
}

TEST(A5PTO2Fatal, AllocTensorConvenienceReportsInvalidArgsInsteadOfAsserting) {
TEST(A2A3Fatal, AllocTensorConvenienceReportsInvalidArgsInsteadOfAsserting) {
FakeRuntime runtime{};
runtime.ops = &kFakeOps;
RuntimeBindingGuard bind(reinterpret_cast<PTO2Runtime *>(&runtime));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ struct FakeRuntime {
std::string last_fatal_message;
};

static_assert(offsetof(FakeRuntime, ops) == 0); // Guard: reinterpret_cast below assumes ops is first member.

FakeRuntime *as_fake(PTO2Runtime *rt) { return reinterpret_cast<FakeRuntime *>(rt); }

TaskOutputTensors fake_submit(PTO2Runtime *rt, const MixedKernels &, const Arg &) {
Expand Down Expand Up @@ -87,20 +89,20 @@ TaskOutputTensors fake_alloc_tensors(PTO2Runtime *rt, const Arg &) {
}

const PTO2RuntimeOps kFakeOps = {
fake_submit,
fake_scope_begin,
fake_scope_end,
fake_orchestration_done,
fake_is_fatal,
fake_report_fatal,
fake_log,
fake_log,
fake_log,
fake_log,
fake_log,
fake_get_tensor_data,
fake_set_tensor_data,
fake_alloc_tensors,
.submit_task = fake_submit,
.scope_begin = fake_scope_begin,
.scope_end = fake_scope_end,
.orchestration_done = fake_orchestration_done,
.is_fatal = fake_is_fatal,
.report_fatal = fake_report_fatal,
.log_error = fake_log,
.log_warn = fake_log,
.log_info = fake_log,
.log_debug = fake_log,
.log_always = fake_log,
.get_tensor_data = fake_get_tensor_data,
.set_tensor_data = fake_set_tensor_data,
.alloc_tensors = fake_alloc_tensors,
};

class RuntimeBindingGuard {
Expand All @@ -116,7 +118,7 @@ TensorCreateInfo make_ci() {

} // namespace

TEST(A2A3PTO2Fatal, ApiShortCircuitsAfterFatal) {
TEST(A5Fatal, ApiShortCircuitsAfterFatal) {
FakeRuntime runtime{};
runtime.ops = &kFakeOps;
runtime.fatal = true;
Expand Down Expand Up @@ -148,7 +150,7 @@ TEST(A2A3PTO2Fatal, ApiShortCircuitsAfterFatal) {
EXPECT_EQ(runtime.report_fatal_calls, 0);
}

TEST(A2A3PTO2Fatal, ExplicitFatalRoutesThroughOps) {
TEST(A5Fatal, ExplicitFatalRoutesThroughOps) {
FakeRuntime runtime{};
runtime.ops = &kFakeOps;
RuntimeBindingGuard bind(reinterpret_cast<PTO2Runtime *>(&runtime));
Expand All @@ -167,7 +169,7 @@ TEST(A2A3PTO2Fatal, ExplicitFatalRoutesThroughOps) {
EXPECT_EQ(runtime.submit_calls, 0);
}

TEST(A2A3PTO2Fatal, AllocTensorConvenienceReportsInvalidArgsInsteadOfAsserting) {
TEST(A5Fatal, AllocTensorConvenienceReportsInvalidArgsInsteadOfAsserting) {
FakeRuntime runtime{};
runtime.ops = &kFakeOps;
RuntimeBindingGuard bind(reinterpret_cast<PTO2Runtime *>(&runtime));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@
/*
* Hardware UT guarding the CANN/HCCL-private ABI coupling in comm_hccl.cpp.
*
* The call chain (dlopen create_device_context ensure_acl_ready_ctx
* aclrtCreateStream comm_init comm_alloc_windows ...) is not the
* interesting part the interesting part is *what's inside* CommContext
* The call chain (dlopen -> create_device_context -> ensure_acl_ready_ctx ->
* aclrtCreateStream -> comm_init -> comm_alloc_windows -> ...) is not the
* interesting part -- the interesting part is *what's inside* CommContext
* after comm_alloc_windows returns. That struct comes from one of:
*
* - MESH topology: `reinterpret_cast<CommContext*>(HCCL's return ptr)`
* - MESH topology: `reinterpret_cast<CommContext*>(HCCL's return ptr)` --
* our layout is *assumed* to match HCCL's internal MESH context.
* - RING topology: our parser reads HcclOpResParam / HcclRankRelationResV2
* field-by-field using offsetof against reverse-engineered struct defs.
Expand All @@ -40,7 +40,7 @@
* gate SIMPLER_ENABLE_HARDWARE_TESTS. Device allocation is driven by
* CTest RESOURCE_GROUPS + --resource-spec-file.
*
* Linking strategy: libhost_runtime.so is dlopen'd it is the subject
* Linking strategy: libhost_runtime.so is dlopen'd -- it is the subject
* under test and mirrors how ChipWorker loads a runtime backend in
* production. libascendcl.so is linked directly at compile time because
* it is generic CANN infra; going through dlsym for acl* here buys nothing
Expand Down Expand Up @@ -122,14 +122,14 @@ constexpr int EXIT_WINDOW_SIZE = 50;
// the CommContext returned by HCCL (MESH reinterpret_cast) or built by our
// RING parser actually contains the fields we expect at the offsets we
// expect. Failure here means our reverse-engineered CANN ABI disagrees with
// the live HCCL build the CANN-coupling fragility this test is here for.
// the live HCCL build -- the CANN-coupling fragility this test is here for.
constexpr int EXIT_CTX_MEMCPY = 55;
constexpr int EXIT_CTX_FIELDS = 56;
constexpr int EXIT_BARRIER = 60;
constexpr int EXIT_DESTROY = 70;

int run_rank(int rank, int nranks, int device_id, const char *rootinfo_path) {
// libhost_runtime.so is the subject under test dlopen mirrors
// libhost_runtime.so is the subject under test -- dlopen mirrors
// ChipWorker. libascendcl is linked in, so acl* is available directly.
void *host_handle = dlopen(PTO_HOST_RUNTIME_LIB_PATH, RTLD_NOW | RTLD_LOCAL);
if (host_handle == nullptr) {
Expand Down Expand Up @@ -215,7 +215,7 @@ int run_rank(int rank, int nranks, int device_id, const char *rootinfo_path) {
host_ctx.windowsIn[rank] != local_base) {
fprintf(
stderr,
"[rank %d] CommContext field mismatch CANN ABI drift?\n"
"[rank %d] CommContext field mismatch -- CANN ABI drift?\n"
" got: rankId=%u rankNum=%u winSize=%lu windowsIn[%d]=0x%lx\n"
" expected: rankId=%d rankNum=%d winSize=%zu windowsIn[%d]=0x%lx\n",
rank, host_ctx.rankId, host_ctx.rankNum, static_cast<unsigned long>(host_ctx.winSize), rank,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ struct OrchestratorFixture : public ::testing::Test {

void TearDown() override { allocator.shutdown(); }

// Per-slot accessor slot state lives inside the Ring now.
// Per-slot accessor -- slot state lives inside the Ring now.
TaskSlotState &S(TaskSlot id) { return *allocator.slot_state(id); }

// Helper: build a TaskArgs whose only tensor has the given (data, tag).
Expand Down Expand Up @@ -86,7 +86,7 @@ TEST_F(OrchestratorFixture, DependentTaskIsPending) {
TaskSlot a_slot;
rq.try_pop(a_slot);

// Task B reads INPUT at the same key depends on A
// Task B reads INPUT at the same key -- depends on A
auto args_b = single_tensor_args(0xBEEF, TensorArgType::INPUT);
auto b = orch.submit_next_level(0xDEAD, args_b, cfg);
EXPECT_EQ(S(b.task_slot).state.load(), TaskState::PENDING);
Expand Down Expand Up @@ -151,7 +151,7 @@ TEST_F(OrchestratorFixture, NoDepTagSkipsDependencyTracking) {
TaskSlot drain_slot;
rq.try_pop(drain_slot);

// Second task references same key but tagged NO_DEP should be independent
// Second task references same key but tagged NO_DEP -- should be independent
auto args_b = single_tensor_args(0xAAAA, TensorArgType::NO_DEP);
auto b = orch.submit_next_level(0xDEAD, args_b, cfg);
EXPECT_EQ(S(b.task_slot).state.load(), TaskState::READY);
Expand Down Expand Up @@ -215,7 +215,7 @@ TEST_F(OrchestratorFixture, OutputAutoAllocsFromHeapRing) {

TEST_F(OrchestratorFixture, InoutWiresCreatorAsFanin) {
// INOUT is the only tag that pulls in the prior writer as a fanin
// producer matching L2's pto_orchestrator.cpp Step B where only
// producer -- matching L2's pto_orchestrator.cpp Step B where only
// INPUT / INOUT do tensor_map.lookup. Users who want a WaW dep on
// the alloc-slot (so its HeapRing slab stays live while they write)
// must tag the buffer INOUT.
Expand Down Expand Up @@ -250,7 +250,7 @@ TEST_F(OrchestratorFixture, InoutWiresCreatorAsFanin) {

TEST_F(OrchestratorFixture, OutputAndOutputExistingAreInsertOnly) {
// Contrast with INOUT: plain OUTPUT and OUTPUT_EXISTING are pure
// overwrites insert into TensorMap, no lookup, so no fanin wire
// overwrites -- insert into TensorMap, no lookup, so no fanin wire
// on the prior writer. Matches L2 semantics for both tags. Users
// who need creator lifetime must tag the buffer INOUT.
struct Case {
Expand Down
Loading
Loading