Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 145 additions & 11 deletions tests/ut/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,49 @@ if(NOT GTEST_LIB OR NOT GTEST_MAIN_LIB)
set(GTEST_INCLUDE_DIRS "") # include dirs are carried by the gtest target
endif()

# ---------------------------------------------------------------------------
# PTO2 runtime sources and stubs for a2a3 ring-buffer / tensormap tests
# ---------------------------------------------------------------------------
# Shared path prefixes. Every value below expands to exactly the same string
# as the fully spelled-out path would.
set(_pto2_src_root ${CMAKE_SOURCE_DIR}/../../../src)
set(_pto2_a2a3_rt_root ${_pto2_src_root}/a2a3/runtime/tensormap_and_ringbuffer)

# Directory holding the a2a3 runtime implementation files.
set(A2A3_RUNTIME_DIR ${_pto2_a2a3_rt_root}/runtime)

# Stub translation unit compiled into the PTO2 runtime test executables.
set(PTO2_STUB_SOURCES ${CMAKE_SOURCE_DIR}/stubs/test_stubs.cpp)

# Runtime implementation files, listed in a fixed order.
set(PTO2_RUNTIME_SOURCES)
foreach(_pto2_unit IN ITEMS
    pto_ring_buffer
    pto_shared_memory
    pto_scheduler
    pto_tensormap)
  list(APPEND PTO2_RUNTIME_SOURCES ${A2A3_RUNTIME_DIR}/${_pto2_unit}.cpp)
endforeach()

# Header search path shared by the PTO2 runtime tests.
set(PTO2_COMMON_INCLUDE_DIRS
  ${_pto2_a2a3_rt_root}/orchestration
  ${_pto2_a2a3_rt_root}/runtime
  ${_pto2_a2a3_rt_root}/common
  ${_pto2_src_root}/a2a3/platform/include
  ${_pto2_src_root}/common/task_interface
)

# add_a2a3_pto2_runtime_test(<name>
#     SOURCES <test-source>...
#     [EXTRA_SOURCES <additional-source>...])
#
# Builds the GoogleTest executable <name> from SOURCES plus the stub sources in
# PTO2_STUB_SOURCES, and registers it with CTest under the "no_hardware" label.
#
#   SOURCES        Test sources; always compiled into the executable.
#   EXTRA_SOURCES  Optional additional sources. An entry is compiled in only if
#                  the file exists at configure time; a missing entry is
#                  skipped with a warning instead of failing the configure.
#
# Example:
#   add_a2a3_pto2_runtime_test(test_tensor SOURCES types/test_tensor.cpp)
function(add_a2a3_pto2_runtime_test name)
  cmake_parse_arguments(ARG "" "" "SOURCES;EXTRA_SOURCES" ${ARGN})

  # SOURCES go in exactly once, here. They are not re-checked in the loop
  # below, so an entry can no longer be listed twice.
  set(_all_sources ${ARG_SOURCES} ${PTO2_STUB_SOURCES})

  foreach(src IN LISTS ARG_EXTRA_SOURCES)
    # if(EXISTS) is only well-defined for full paths, so resolve a relative
    # entry against this directory (the same base add_executable uses).
    if(IS_ABSOLUTE "${src}")
      set(_src_path "${src}")
    else()
      set(_src_path "${CMAKE_CURRENT_SOURCE_DIR}/${src}")
    endif()

    if(EXISTS "${_src_path}")
      list(APPEND _all_sources "${src}")
    else()
      message(WARNING
        "add_a2a3_pto2_runtime_test(${name}): skipping missing EXTRA_SOURCES entry '${src}'")
    endif()
  endforeach()

  add_executable(${name} ${_all_sources})
  target_include_directories(${name} PRIVATE
    ${GTEST_INCLUDE_DIRS}
    ${PTO2_COMMON_INCLUDE_DIRS}
  )
  # Kept as a compile option (not a compile definition) so its position on the
  # compiler command line relative to other flags is unchanged.
  target_compile_options(${name} PRIVATE -D_GLIBCXX_USE_CXX11_ABI=0)
  target_link_libraries(${name} PRIVATE
    ${GTEST_MAIN_LIB}
    ${GTEST_LIB}
    pthread
  )

  add_test(NAME ${name} COMMAND ${name})
  set_tests_properties(${name} PROPERTIES LABELS "no_hardware")
endfunction()

# ---------------------------------------------------------------------------
# Distributed runtime sources under test
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -151,13 +194,6 @@ function(add_a5_pto2_test name src)
add_test(NAME ${name} COMMAND ${name})
endfunction()

enable_testing()

add_hierarchical_test(test_tensormap test_tensormap.cpp)
add_hierarchical_test(test_ring test_ring.cpp)
add_hierarchical_test(test_scope test_scope.cpp)
add_hierarchical_test(test_orchestrator test_orchestrator.cpp)
add_hierarchical_test(test_scheduler test_scheduler.cpp)
function(add_task_interface_test name src)
add_executable(${name} ${src})
target_include_directories(${name} PRIVATE
Expand All @@ -173,9 +209,107 @@ function(add_task_interface_test name src)
add_test(NAME ${name} COMMAND ${name})
endfunction()

add_task_interface_test(test_child_memory test_child_memory.cpp)
add_a2a3_pto2_test(test_a2a3_pto2_fatal test_a2a3_pto2_fatal.cpp)
add_a5_pto2_test(test_a5_pto2_fatal test_a5_pto2_fatal.cpp)
enable_testing()

# ---------------------------------------------------------------------------
# Hierarchical runtime tests (src/common/hierarchical/)
# Each suite <s> is registered as test_<s>, built from hierarchical/test_<s>.cpp.
# ---------------------------------------------------------------------------
foreach(_hier_suite IN ITEMS
    tensormap
    ring
    scope
    orchestrator
    scheduler
    worker_manager)
  add_hierarchical_test(test_${_hier_suite} hierarchical/test_${_hier_suite}.cpp)
endforeach()

# ---------------------------------------------------------------------------
# Types / task_interface tests (src/common/task_interface/)
# Test sources live under types/.
# ---------------------------------------------------------------------------
add_task_interface_test(test_child_memory types/test_child_memory.cpp)

# These two go through the PTO2 runtime helper, which also compiles in the
# stub sources.
add_a2a3_pto2_runtime_test(test_pto_types SOURCES types/test_pto_types.cpp)
add_a2a3_pto2_runtime_test(test_tensor SOURCES types/test_tensor.cpp)

# ---------------------------------------------------------------------------
# PTO2 A2A3 tests (src/a2a3/runtime/tensormap_and_ringbuffer/)
# Each case <c> is registered as test_<c>, built from pto2_a2a3/test_<c>.cpp.
# ---------------------------------------------------------------------------
foreach(_a2a3_case IN ITEMS
    a2a3_pto2_fatal
    runtime_status
    core_types
    dispatch_payload
    handshake
    submit_types)
  add_a2a3_pto2_test(test_${_a2a3_case} pto2_a2a3/test_${_a2a3_case}.cpp)
endforeach()

# ---------------------------------------------------------------------------
# PTO2 A5 tests (src/a5/runtime/tensormap_and_ringbuffer/)
# ---------------------------------------------------------------------------
add_a5_pto2_test(test_a5_pto2_fatal pto2_a5/test_a5_pto2_fatal.cpp)

# ---------------------------------------------------------------------------
# Platform sim layer tests
# ---------------------------------------------------------------------------
# Locations inside the a2a3 platform tree; each expands to the same string as
# the fully spelled-out path.
set(_a2a3_platform_root ${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform)
set(A2A3_PLATFORM_SIM_DIR ${_a2a3_platform_root}/sim)
set(A2A3_PLATFORM_HOST_DIR ${_a2a3_platform_root}/src/host)
set(A2A3_PLATFORM_INCLUDE ${_a2a3_platform_root}/include)

# test_platform_memory_allocator: the test source, the sim-layer host
# memory_allocator.cpp, and the shared stub sources.
add_executable(test_platform_memory_allocator
  platform/test_platform_memory_allocator.cpp
  ${A2A3_PLATFORM_SIM_DIR}/host/memory_allocator.cpp
  ${PTO2_STUB_SOURCES}
)
target_include_directories(test_platform_memory_allocator
  PRIVATE ${GTEST_INCLUDE_DIRS} ${A2A3_PLATFORM_INCLUDE}
)
target_compile_options(test_platform_memory_allocator
  PRIVATE -D_GLIBCXX_USE_CXX11_ABI=0
)
target_link_libraries(test_platform_memory_allocator
  PRIVATE ${GTEST_MAIN_LIB} ${GTEST_LIB} pthread
)

# Register with CTest and tag it with the "no_hardware" label.
add_test(NAME test_platform_memory_allocator COMMAND test_platform_memory_allocator)
set_property(TEST test_platform_memory_allocator PROPERTY LABELS "no_hardware")

# test_platform_host_log: the test source compiled together with the host-side
# host_log.cpp; headers are looked up in the host source directory itself.
add_executable(test_platform_host_log
  platform/test_platform_host_log.cpp
  ${A2A3_PLATFORM_HOST_DIR}/host_log.cpp
)
target_include_directories(test_platform_host_log
  PRIVATE ${GTEST_INCLUDE_DIRS} ${A2A3_PLATFORM_HOST_DIR}
)
target_compile_options(test_platform_host_log
  PRIVATE -D_GLIBCXX_USE_CXX11_ABI=0
)
target_link_libraries(test_platform_host_log
  PRIVATE ${GTEST_MAIN_LIB} ${GTEST_LIB} pthread
)

# Register with CTest and tag it with the "no_hardware" label.
add_test(NAME test_platform_host_log COMMAND test_platform_host_log)
set_property(TEST test_platform_host_log PROPERTY LABELS "no_hardware")

# host_build_graph Runtime test
# Compiles the host_build_graph runtime.cpp together with the test source and
# the shared stub sources.
set(HBG_RUNTIME_DIR ${CMAKE_SOURCE_DIR}/../../../src/a2a3/runtime/host_build_graph/runtime)

add_executable(test_runtime_graph
  pto2_a2a3/test_runtime_graph.cpp
  ${HBG_RUNTIME_DIR}/runtime.cpp
  ${PTO2_STUB_SOURCES}
)
target_include_directories(test_runtime_graph
  PRIVATE
    ${GTEST_INCLUDE_DIRS}
    ${HBG_RUNTIME_DIR}
    ${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform/include
    ${CMAKE_SOURCE_DIR}/../../../src/common/task_interface
)
target_compile_options(test_runtime_graph
  PRIVATE -D_GLIBCXX_USE_CXX11_ABI=0
)
target_link_libraries(test_runtime_graph
  PRIVATE ${GTEST_MAIN_LIB} ${GTEST_LIB} pthread
)

# Register with CTest and tag it with the "no_hardware" label.
add_test(NAME test_runtime_graph COMMAND test_runtime_graph)
set_property(TEST test_runtime_graph PROPERTY LABELS "no_hardware")

# Hardware-gated tests. Block is only entered when the project is configured
# with -DSIMPLER_ENABLE_HARDWARE_TESTS=ON. CI's no-hw `ut` job does not pass
Expand Down Expand Up @@ -238,5 +372,5 @@ if(SIMPLER_ENABLE_HARDWARE_TESTS)
)
endfunction()

add_comm_api_test(test_hccl_comm test_hccl_comm.cpp)
add_comm_api_test(test_hccl_comm hardware/test_hccl_comm.cpp)
endif()
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@
/*
* Hardware UT guarding the CANN/HCCL-private ABI coupling in comm_hccl.cpp.
*
* The call chain (dlopen create_device_context ensure_acl_ready_ctx
* aclrtCreateStream comm_init comm_alloc_windows ...) is not the
* interesting part the interesting part is *what's inside* CommContext
* The call chain (dlopen -> create_device_context -> ensure_acl_ready_ctx ->
* aclrtCreateStream -> comm_init -> comm_alloc_windows -> ...) is not the
* interesting part -- the interesting part is *what's inside* CommContext
* after comm_alloc_windows returns. That struct comes from one of:
*
* - MESH topology: `reinterpret_cast<CommContext*>(HCCL's return ptr)`
* - MESH topology: `reinterpret_cast<CommContext*>(HCCL's return ptr)` --
* our layout is *assumed* to match HCCL's internal MESH context.
* - RING topology: our parser reads HcclOpResParam / HcclRankRelationResV2
* field-by-field using offsetof against reverse-engineered struct defs.
Expand All @@ -40,7 +40,7 @@
* gate SIMPLER_ENABLE_HARDWARE_TESTS. Device allocation is driven by
* CTest RESOURCE_GROUPS + --resource-spec-file.
*
* Linking strategy: libhost_runtime.so is dlopen'd it is the subject
* Linking strategy: libhost_runtime.so is dlopen'd -- it is the subject
* under test and mirrors how ChipWorker loads a runtime backend in
* production. libascendcl.so is linked directly at compile time because
* it is generic CANN infra; going through dlsym for acl* here buys nothing
Expand Down Expand Up @@ -122,14 +122,14 @@ constexpr int EXIT_WINDOW_SIZE = 50;
// the CommContext returned by HCCL (MESH reinterpret_cast) or built by our
// RING parser actually contains the fields we expect at the offsets we
// expect. Failure here means our reverse-engineered CANN ABI disagrees with
// the live HCCL build the CANN-coupling fragility this test is here for.
// the live HCCL build -- the CANN-coupling fragility this test is here for.
constexpr int EXIT_CTX_MEMCPY = 55;
constexpr int EXIT_CTX_FIELDS = 56;
constexpr int EXIT_BARRIER = 60;
constexpr int EXIT_DESTROY = 70;

int run_rank(int rank, int nranks, int device_id, const char *rootinfo_path) {
// libhost_runtime.so is the subject under test dlopen mirrors
// libhost_runtime.so is the subject under test -- dlopen mirrors
// ChipWorker. libascendcl is linked in, so acl* is available directly.
void *host_handle = dlopen(PTO_HOST_RUNTIME_LIB_PATH, RTLD_NOW | RTLD_LOCAL);
if (host_handle == nullptr) {
Expand Down Expand Up @@ -215,7 +215,7 @@ int run_rank(int rank, int nranks, int device_id, const char *rootinfo_path) {
host_ctx.windowsIn[rank] != local_base) {
fprintf(
stderr,
"[rank %d] CommContext field mismatch CANN ABI drift?\n"
"[rank %d] CommContext field mismatch -- CANN ABI drift?\n"
" got: rankId=%u rankNum=%u winSize=%lu windowsIn[%d]=0x%lx\n"
" expected: rankId=%d rankNum=%d winSize=%zu windowsIn[%d]=0x%lx\n",
rank, host_ctx.rankId, host_ctx.rankNum, static_cast<unsigned long>(host_ctx.winSize), rank,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ struct OrchestratorFixture : public ::testing::Test {

void TearDown() override { allocator.shutdown(); }

// Per-slot accessor slot state lives inside the Ring now.
// Per-slot accessor -- slot state lives inside the Ring now.
TaskSlotState &S(TaskSlot id) { return *allocator.slot_state(id); }

// Helper: build a TaskArgs whose only tensor has the given (data, tag).
Expand Down Expand Up @@ -86,7 +86,7 @@ TEST_F(OrchestratorFixture, DependentTaskIsPending) {
TaskSlot a_slot;
rq.try_pop(a_slot);

// Task B reads INPUT at the same key depends on A
// Task B reads INPUT at the same key -- depends on A
auto args_b = single_tensor_args(0xBEEF, TensorArgType::INPUT);
auto b = orch.submit_next_level(0xDEAD, args_b, cfg);
EXPECT_EQ(S(b.task_slot).state.load(), TaskState::PENDING);
Expand Down Expand Up @@ -151,7 +151,7 @@ TEST_F(OrchestratorFixture, NoDepTagSkipsDependencyTracking) {
TaskSlot drain_slot;
rq.try_pop(drain_slot);

// Second task references same key but tagged NO_DEP should be independent
// Second task references same key but tagged NO_DEP -- should be independent
auto args_b = single_tensor_args(0xAAAA, TensorArgType::NO_DEP);
auto b = orch.submit_next_level(0xDEAD, args_b, cfg);
EXPECT_EQ(S(b.task_slot).state.load(), TaskState::READY);
Expand Down Expand Up @@ -215,7 +215,7 @@ TEST_F(OrchestratorFixture, OutputAutoAllocsFromHeapRing) {

TEST_F(OrchestratorFixture, InoutWiresCreatorAsFanin) {
// INOUT is the only tag that pulls in the prior writer as a fanin
// producer matching L2's pto_orchestrator.cpp Step B where only
// producer -- matching L2's pto_orchestrator.cpp Step B where only
// INPUT / INOUT do tensor_map.lookup. Users who want a WaW dep on
// the alloc-slot (so its HeapRing slab stays live while they write)
// must tag the buffer INOUT.
Expand Down Expand Up @@ -250,7 +250,7 @@ TEST_F(OrchestratorFixture, InoutWiresCreatorAsFanin) {

TEST_F(OrchestratorFixture, OutputAndOutputExistingAreInsertOnly) {
// Contrast with INOUT: plain OUTPUT and OUTPUT_EXISTING are pure
// overwrites insert into TensorMap, no lookup, so no fanin wire
// overwrites -- insert into TensorMap, no lookup, so no fanin wire
// on the prior writer. Matches L2 semantics for both tags. Users
// who need creator lifetime must tag the buffer INOUT.
struct Case {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ TEST(Ring, SlotStateIsPointerStable) {
TaskSlotState *p0 = a.slot_state(r0.slot);
ASSERT_NE(p0, nullptr);

// Push many more slots through the deque may grow/chain, but the
// Push many more slots through -- the deque may grow/chain, but the
// pointer we grabbed for slot 0 has to stay valid.
for (int i = 0; i < 1000; ++i) {
(void)a.alloc();
Expand Down Expand Up @@ -227,7 +227,7 @@ TEST(Ring, ScopeDepthMapsToRingIdx) {
}

TEST(Ring, PerRingHeapsAreDistinctMmaps) {
// Total VA = 4 × 4 KiB; verify each ring has its own mapping.
// Total VA = 4 x 4 KiB; verify each ring has its own mapping.
Ring a;
a.init(kSmallHeap, kQuickTimeoutMs);

Expand All @@ -241,7 +241,7 @@ TEST(Ring, PerRingHeapsAreDistinctMmaps) {
for (int i = 0; i < MAX_RING_DEPTH; ++i) {
for (int j = i + 1; j < MAX_RING_DEPTH; ++j) {
EXPECT_NE(bases[i], bases[j])
<< "rings " << i << " and " << j << " share a mapping expected 4 separate mmaps";
<< "rings " << i << " and " << j << " share a mapping -- expected 4 separate mmaps";
}
}
}
Expand Down Expand Up @@ -292,7 +292,7 @@ TEST(Ring, RingsReclaimIndependently) {
EXPECT_EQ(r1a.ring_idx, 1);
EXPECT_EQ(r1b.ring_idx, 1);

// Ring 0 is untouched this must succeed instantly, not time out.
// Ring 0 is untouched -- this must succeed instantly, not time out.
auto r0 = a.alloc(HEAP_ALIGN, /*scope_depth=*/0);
EXPECT_EQ(r0.ring_idx, 0);
ASSERT_NE(r0.heap_ptr, nullptr);
Expand Down Expand Up @@ -322,7 +322,7 @@ TEST(Ring, InnerRingReclaimsWhileOuterHolds) {
EXPECT_EQ(a.heap_top(0), HEAP_ALIGN);
EXPECT_EQ(a.heap_tail(0), 0u);

// Churn on the inner ring allocate, release, allocate, release, ...
// Churn on the inner ring -- allocate, release, allocate, release, ...
for (int i = 0; i < 8; ++i) {
auto inner = a.alloc(HEAP_ALIGN, /*scope_depth=*/1);
a.release(inner.slot);
Expand All @@ -331,7 +331,7 @@ TEST(Ring, InnerRingReclaimsWhileOuterHolds) {
// Outer ring unchanged (one live slab at offset 0).
EXPECT_EQ(a.heap_top(0), HEAP_ALIGN);
EXPECT_EQ(a.heap_tail(0), 0u);
// Inner ring reclaimed each slab tail caught up to top.
// Inner ring reclaimed each slab -- tail caught up to top.
EXPECT_EQ(a.heap_tail(1), a.heap_top(1));

a.release(outer.slot);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ TEST_F(SchedulerFixture, DependentTaskDispatchedAfterProducerCompletes) {
}

// ===========================================================================
// Group task tests fixture with 2 MockWorkers
// Group task tests -- fixture with 2 MockWorkers
// ===========================================================================

struct GroupSchedulerFixture : public ::testing::Test {
Expand Down Expand Up @@ -405,7 +405,7 @@ TEST_F(MixedTypeSchedulerFixture, SubTaskDispatchesWhileNextLevelPoolSaturated)
EXPECT_TRUE(next_level_worker.is_running.load()) << "chip worker must still be busy";

// Complete the sub task first; it reaches CONSUMED while the chip task
// is still running demonstrating independent per-type dispatch.
// is still running -- demonstrating independent per-type dispatch.
sub_worker.complete();
wait_consumed(sub.task_slot);
EXPECT_FALSE(is_consumed(chip.task_slot));
Expand All @@ -416,7 +416,7 @@ TEST_F(MixedTypeSchedulerFixture, SubTaskDispatchesWhileNextLevelPoolSaturated)

TEST_F(GroupSchedulerFixture, GroupDependencyChain) {
// Group A (2 workers) produces an OUTPUT at key 0xCAFE.
// Task B reads INPUT at the same key depends on group A.
// Task B reads INPUT at the same key -- depends on group A.
TaskArgs a0 = single_tensor_args(0xCAFE, TensorArgType::OUTPUT);
TaskArgs a1 = single_tensor_args(0xCAFE, TensorArgType::OUTPUT);
auto a = orch.submit_next_level_group(0xDEAD, {a0, a1}, cfg);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ TEST(Scope, SingleScope_ReleasesRegisteredTasks) {

TEST(Scope, RegisterOutsideScopeIsNoop) {
Scope sc;
sc.register_task(5); // no open scope should not throw
sc.register_task(5); // no open scope -- should not throw
EXPECT_EQ(sc.depth(), 0);
}

Expand Down
Loading
Loading