Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 71 additions & 13 deletions tests/ut/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,49 @@ if(NOT GTEST_LIB OR NOT GTEST_MAIN_LIB)
set(GTEST_INCLUDE_DIRS "") # include dirs are carried by the gtest target
endif()

# ---------------------------------------------------------------------------
# A2A3 runtime sources and stubs for ring-buffer / tensormap tests
# ---------------------------------------------------------------------------
# Single definition of the repository root (this test project sits three
# directory levels below it — presumably tests/ut/cpp; confirm if the tree
# moves). Every A2A3 path is derived from these two variables so a future
# relocation needs exactly one edit instead of six.
set(A2A3_REPO_ROOT ${CMAKE_SOURCE_DIR}/../../..)
set(A2A3_TMRB_DIR ${A2A3_REPO_ROOT}/src/a2a3/runtime/tensormap_and_ringbuffer)

# Directory holding the runtime translation units under test.
set(A2A3_RUNTIME_DIR ${A2A3_TMRB_DIR}/runtime)

# Shared stub sources linked into every A2A3 runtime test executable.
set(A2A3_STUB_SOURCES ${CMAKE_SOURCE_DIR}/stubs/test_stubs.cpp)

# Runtime translation units exercised by the ring-buffer / tensormap tests.
set(A2A3_RUNTIME_SOURCES
    ${A2A3_RUNTIME_DIR}/pto_ring_buffer.cpp
    ${A2A3_RUNTIME_DIR}/pto_shared_memory.cpp
    ${A2A3_RUNTIME_DIR}/pto_scheduler.cpp
    ${A2A3_RUNTIME_DIR}/pto_tensormap.cpp
)

# Include paths common to every A2A3 runtime test target.
set(A2A3_COMMON_INCLUDE_DIRS
    ${A2A3_TMRB_DIR}/orchestration
    ${A2A3_TMRB_DIR}/runtime
    ${A2A3_TMRB_DIR}/common
    ${A2A3_REPO_ROOT}/src/a2a3/platform/include
    ${A2A3_REPO_ROOT}/src/common/task_interface
)

# add_a2a3_runtime_test(<name> SOURCES <src>... [EXTRA_SOURCES <src>...])
#
# Registers a host-only gtest executable (labelled "no_hardware") for the A2A3
# ring-buffer / tensormap runtime.
#   SOURCES        — always compiled into the test binary.
#   EXTRA_SOURCES  — optional files, compiled only when present on disk (lets
#                    a shared test list reference per-variant files that may
#                    not exist in every checkout).
# The shared A2A3 stubs (${A2A3_STUB_SOURCES}) are linked into every test.
function(add_a2a3_runtime_test name)
    cmake_parse_arguments(PARSE_ARGV 1 ARG "" "" "SOURCES;EXTRA_SOURCES")
    if(ARG_UNPARSED_ARGUMENTS)
        message(FATAL_ERROR
            "add_a2a3_runtime_test(${name}): unknown arguments: ${ARG_UNPARSED_ARGUMENTS}")
    endif()
    # Mandatory sources plus shared stubs. Note: only EXTRA_SOURCES go through
    # the EXISTS filter — the previous version also looped over SOURCES here,
    # which listed every mandatory source twice in add_executable().
    set(_all_sources ${ARG_SOURCES} ${A2A3_STUB_SOURCES})
    foreach(src IN LISTS ARG_EXTRA_SOURCES)
        if(EXISTS "${src}")
            list(APPEND _all_sources "${src}")
        endif()
    endforeach()
    add_executable(${name} ${_all_sources})
    target_include_directories(${name} PRIVATE
        ${GTEST_INCLUDE_DIRS}
        ${A2A3_COMMON_INCLUDE_DIRS}
    )
    # Preprocessor define, so use target_compile_definitions rather than
    # smuggling a -D flag through target_compile_options. Must match the ABI
    # the prebuilt gtest libraries were compiled with.
    target_compile_definitions(${name} PRIVATE _GLIBCXX_USE_CXX11_ABI=0)
    # NOTE(review): bare `pthread` mirrors the sibling helpers in this file;
    # Threads::Threads would be preferable once find_package(Threads) is added.
    target_link_libraries(${name} PRIVATE
        ${GTEST_MAIN_LIB}
        ${GTEST_LIB}
        pthread
    )
    add_test(NAME ${name} COMMAND ${name})
    # Label lets the no-hardware CI job select these tests via ctest -L.
    set_tests_properties(${name} PROPERTIES LABELS "no_hardware")
endfunction()

# ---------------------------------------------------------------------------
# Distributed runtime sources under test
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -113,7 +156,7 @@ function(add_hierarchical_test name src)
add_test(NAME ${name} COMMAND ${name})
endfunction()

function(add_a2a3_pto2_test name src)
function(add_a2a3_test name src)
add_executable(${name} ${src})
target_include_directories(${name} PRIVATE
${GTEST_INCLUDE_DIRS}
Expand All @@ -132,7 +175,7 @@ function(add_a2a3_pto2_test name src)
add_test(NAME ${name} COMMAND ${name})
endfunction()

function(add_a5_pto2_test name src)
function(add_a5_test name src)
add_executable(${name} ${src})
target_include_directories(${name} PRIVATE
${GTEST_INCLUDE_DIRS}
Expand All @@ -151,13 +194,6 @@ function(add_a5_pto2_test name src)
add_test(NAME ${name} COMMAND ${name})
endfunction()

enable_testing()

add_hierarchical_test(test_tensormap test_tensormap.cpp)
add_hierarchical_test(test_ring test_ring.cpp)
add_hierarchical_test(test_scope test_scope.cpp)
add_hierarchical_test(test_orchestrator test_orchestrator.cpp)
add_hierarchical_test(test_scheduler test_scheduler.cpp)
function(add_task_interface_test name src)
add_executable(${name} ${src})
target_include_directories(${name} PRIVATE
Expand All @@ -173,9 +209,31 @@ function(add_task_interface_test name src)
add_test(NAME ${name} COMMAND ${name})
endfunction()

add_task_interface_test(test_child_memory test_child_memory.cpp)
add_a2a3_pto2_test(test_a2a3_pto2_fatal test_a2a3_pto2_fatal.cpp)
add_a5_pto2_test(test_a5_pto2_fatal test_a5_pto2_fatal.cpp)
# Enable CTest before any helper is *invoked* (the helpers above only define
# functions; add_test runs when they are called below).
enable_testing()

# ---------------------------------------------------------------------------
# Hierarchical runtime tests (src/common/hierarchical/)
# ---------------------------------------------------------------------------
add_hierarchical_test(test_tensormap hierarchical/test_tensormap.cpp)
add_hierarchical_test(test_ring hierarchical/test_ring.cpp)
add_hierarchical_test(test_scope hierarchical/test_scope.cpp)
add_hierarchical_test(test_orchestrator hierarchical/test_orchestrator.cpp)
add_hierarchical_test(test_scheduler hierarchical/test_scheduler.cpp)

# ---------------------------------------------------------------------------
# Types / task_interface tests (src/common/task_interface/)
# ---------------------------------------------------------------------------
add_task_interface_test(test_child_memory types/test_child_memory.cpp)

# ---------------------------------------------------------------------------
# A2A3 tests (src/a2a3/runtime/tensormap_and_ringbuffer/)
# ---------------------------------------------------------------------------
add_a2a3_test(test_a2a3_fatal a2a3/test_a2a3_fatal.cpp)

# ---------------------------------------------------------------------------
# A5 tests (src/a5/runtime/tensormap_and_ringbuffer/)
# ---------------------------------------------------------------------------
add_a5_test(test_a5_fatal a5/test_a5_fatal.cpp)

# Hardware-gated tests. Block is only entered when the project is configured
# with -DSIMPLER_ENABLE_HARDWARE_TESTS=ON. CI's no-hw `ut` job does not pass
Expand Down Expand Up @@ -238,5 +296,5 @@ if(SIMPLER_ENABLE_HARDWARE_TESTS)
)
endfunction()

add_comm_api_test(test_hccl_comm test_hccl_comm.cpp)
add_comm_api_test(test_hccl_comm hardware/test_hccl_comm.cpp)
endif()
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ struct FakeRuntime {
std::string last_fatal_message;
};

static_assert(offsetof(FakeRuntime, ops) == 0); // Guard: reinterpret_cast below assumes ops is first member.

FakeRuntime *as_fake(PTO2Runtime *rt) { return reinterpret_cast<FakeRuntime *>(rt); }

TaskOutputTensors fake_submit(PTO2Runtime *rt, const MixedKernels &, const Arg &) {
Expand Down Expand Up @@ -87,20 +89,20 @@ TaskOutputTensors fake_alloc_tensors(PTO2Runtime *rt, const Arg &) {
}

const PTO2RuntimeOps kFakeOps = {
fake_submit,
fake_scope_begin,
fake_scope_end,
fake_orchestration_done,
fake_is_fatal,
fake_report_fatal,
fake_log,
fake_log,
fake_log,
fake_log,
fake_log,
fake_get_tensor_data,
fake_set_tensor_data,
fake_alloc_tensors,
.submit_task = fake_submit,
.scope_begin = fake_scope_begin,
.scope_end = fake_scope_end,
.orchestration_done = fake_orchestration_done,
.is_fatal = fake_is_fatal,
.report_fatal = fake_report_fatal,
.log_error = fake_log,
.log_warn = fake_log,
.log_info = fake_log,
.log_debug = fake_log,
.log_always = fake_log,
.get_tensor_data = fake_get_tensor_data,
.set_tensor_data = fake_set_tensor_data,
.alloc_tensors = fake_alloc_tensors,
};

class RuntimeBindingGuard {
Expand All @@ -116,7 +118,7 @@ TensorCreateInfo make_ci() {

} // namespace

TEST(A5PTO2Fatal, ApiShortCircuitsAfterFatal) {
TEST(A2A3Fatal, ApiShortCircuitsAfterFatal) {
FakeRuntime runtime{};
runtime.ops = &kFakeOps;
runtime.fatal = true;
Expand Down Expand Up @@ -148,7 +150,7 @@ TEST(A5PTO2Fatal, ApiShortCircuitsAfterFatal) {
EXPECT_EQ(runtime.report_fatal_calls, 0);
}

TEST(A5PTO2Fatal, ExplicitFatalRoutesThroughOps) {
TEST(A2A3Fatal, ExplicitFatalRoutesThroughOps) {
FakeRuntime runtime{};
runtime.ops = &kFakeOps;
RuntimeBindingGuard bind(reinterpret_cast<PTO2Runtime *>(&runtime));
Expand All @@ -167,7 +169,7 @@ TEST(A5PTO2Fatal, ExplicitFatalRoutesThroughOps) {
EXPECT_EQ(runtime.submit_calls, 0);
}

TEST(A5PTO2Fatal, AllocTensorConvenienceReportsInvalidArgsInsteadOfAsserting) {
TEST(A2A3Fatal, AllocTensorConvenienceReportsInvalidArgsInsteadOfAsserting) {
FakeRuntime runtime{};
runtime.ops = &kFakeOps;
RuntimeBindingGuard bind(reinterpret_cast<PTO2Runtime *>(&runtime));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ struct FakeRuntime {
std::string last_fatal_message;
};

static_assert(offsetof(FakeRuntime, ops) == 0); // Guard: reinterpret_cast below assumes ops is first member.

FakeRuntime *as_fake(PTO2Runtime *rt) { return reinterpret_cast<FakeRuntime *>(rt); }

TaskOutputTensors fake_submit(PTO2Runtime *rt, const MixedKernels &, const Arg &) {
Expand Down Expand Up @@ -87,20 +89,20 @@ TaskOutputTensors fake_alloc_tensors(PTO2Runtime *rt, const Arg &) {
}

const PTO2RuntimeOps kFakeOps = {
fake_submit,
fake_scope_begin,
fake_scope_end,
fake_orchestration_done,
fake_is_fatal,
fake_report_fatal,
fake_log,
fake_log,
fake_log,
fake_log,
fake_log,
fake_get_tensor_data,
fake_set_tensor_data,
fake_alloc_tensors,
.submit_task = fake_submit,
.scope_begin = fake_scope_begin,
.scope_end = fake_scope_end,
.orchestration_done = fake_orchestration_done,
.is_fatal = fake_is_fatal,
.report_fatal = fake_report_fatal,
.log_error = fake_log,
.log_warn = fake_log,
.log_info = fake_log,
.log_debug = fake_log,
.log_always = fake_log,
.get_tensor_data = fake_get_tensor_data,
.set_tensor_data = fake_set_tensor_data,
.alloc_tensors = fake_alloc_tensors,
};

class RuntimeBindingGuard {
Expand All @@ -116,7 +118,7 @@ TensorCreateInfo make_ci() {

} // namespace

TEST(A2A3PTO2Fatal, ApiShortCircuitsAfterFatal) {
TEST(A5Fatal, ApiShortCircuitsAfterFatal) {
FakeRuntime runtime{};
runtime.ops = &kFakeOps;
runtime.fatal = true;
Expand Down Expand Up @@ -148,7 +150,7 @@ TEST(A2A3PTO2Fatal, ApiShortCircuitsAfterFatal) {
EXPECT_EQ(runtime.report_fatal_calls, 0);
}

TEST(A2A3PTO2Fatal, ExplicitFatalRoutesThroughOps) {
TEST(A5Fatal, ExplicitFatalRoutesThroughOps) {
FakeRuntime runtime{};
runtime.ops = &kFakeOps;
RuntimeBindingGuard bind(reinterpret_cast<PTO2Runtime *>(&runtime));
Expand All @@ -167,7 +169,7 @@ TEST(A2A3PTO2Fatal, ExplicitFatalRoutesThroughOps) {
EXPECT_EQ(runtime.submit_calls, 0);
}

TEST(A2A3PTO2Fatal, AllocTensorConvenienceReportsInvalidArgsInsteadOfAsserting) {
TEST(A5Fatal, AllocTensorConvenienceReportsInvalidArgsInsteadOfAsserting) {
FakeRuntime runtime{};
runtime.ops = &kFakeOps;
RuntimeBindingGuard bind(reinterpret_cast<PTO2Runtime *>(&runtime));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@
/*
* Hardware UT guarding the CANN/HCCL-private ABI coupling in comm_hccl.cpp.
*
* The call chain (dlopen create_device_context ensure_acl_ready_ctx
* aclrtCreateStream comm_init comm_alloc_windows ...) is not the
* interesting part the interesting part is *what's inside* CommContext
* The call chain (dlopen -> create_device_context -> ensure_acl_ready_ctx ->
* aclrtCreateStream -> comm_init -> comm_alloc_windows -> ...) is not the
* interesting part -- the interesting part is *what's inside* CommContext
* after comm_alloc_windows returns. That struct comes from one of:
*
* - MESH topology: `reinterpret_cast<CommContext*>(HCCL's return ptr)`
* - MESH topology: `reinterpret_cast<CommContext*>(HCCL's return ptr)` --
* our layout is *assumed* to match HCCL's internal MESH context.
* - RING topology: our parser reads HcclOpResParam / HcclRankRelationResV2
* field-by-field using offsetof against reverse-engineered struct defs.
Expand All @@ -40,7 +40,7 @@
* gate SIMPLER_ENABLE_HARDWARE_TESTS. Device allocation is driven by
* CTest RESOURCE_GROUPS + --resource-spec-file.
*
* Linking strategy: libhost_runtime.so is dlopen'd it is the subject
* Linking strategy: libhost_runtime.so is dlopen'd -- it is the subject
* under test and mirrors how ChipWorker loads a runtime backend in
* production. libascendcl.so is linked directly at compile time because
* it is generic CANN infra; going through dlsym for acl* here buys nothing
Expand Down Expand Up @@ -122,14 +122,14 @@ constexpr int EXIT_WINDOW_SIZE = 50;
// the CommContext returned by HCCL (MESH reinterpret_cast) or built by our
// RING parser actually contains the fields we expect at the offsets we
// expect. Failure here means our reverse-engineered CANN ABI disagrees with
// the live HCCL build the CANN-coupling fragility this test is here for.
// the live HCCL build -- the CANN-coupling fragility this test is here for.
constexpr int EXIT_CTX_MEMCPY = 55;
constexpr int EXIT_CTX_FIELDS = 56;
constexpr int EXIT_BARRIER = 60;
constexpr int EXIT_DESTROY = 70;

int run_rank(int rank, int nranks, int device_id, const char *rootinfo_path) {
// libhost_runtime.so is the subject under test dlopen mirrors
// libhost_runtime.so is the subject under test -- dlopen mirrors
// ChipWorker. libascendcl is linked in, so acl* is available directly.
void *host_handle = dlopen(PTO_HOST_RUNTIME_LIB_PATH, RTLD_NOW | RTLD_LOCAL);
if (host_handle == nullptr) {
Expand Down Expand Up @@ -215,7 +215,7 @@ int run_rank(int rank, int nranks, int device_id, const char *rootinfo_path) {
host_ctx.windowsIn[rank] != local_base) {
fprintf(
stderr,
"[rank %d] CommContext field mismatch CANN ABI drift?\n"
"[rank %d] CommContext field mismatch -- CANN ABI drift?\n"
" got: rankId=%u rankNum=%u winSize=%lu windowsIn[%d]=0x%lx\n"
" expected: rankId=%d rankNum=%d winSize=%zu windowsIn[%d]=0x%lx\n",
rank, host_ctx.rankId, host_ctx.rankNum, static_cast<unsigned long>(host_ctx.winSize), rank,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ struct OrchestratorFixture : public ::testing::Test {

void TearDown() override { allocator.shutdown(); }

// Per-slot accessor slot state lives inside the Ring now.
// Per-slot accessor -- slot state lives inside the Ring now.
TaskSlotState &S(TaskSlot id) { return *allocator.slot_state(id); }

// Helper: build a TaskArgs whose only tensor has the given (data, tag).
Expand Down Expand Up @@ -86,7 +86,7 @@ TEST_F(OrchestratorFixture, DependentTaskIsPending) {
TaskSlot a_slot;
rq.try_pop(a_slot);

// Task B reads INPUT at the same key depends on A
// Task B reads INPUT at the same key -- depends on A
auto args_b = single_tensor_args(0xBEEF, TensorArgType::INPUT);
auto b = orch.submit_next_level(0xDEAD, args_b, cfg);
EXPECT_EQ(S(b.task_slot).state.load(), TaskState::PENDING);
Expand Down Expand Up @@ -151,7 +151,7 @@ TEST_F(OrchestratorFixture, NoDepTagSkipsDependencyTracking) {
TaskSlot drain_slot;
rq.try_pop(drain_slot);

// Second task references same key but tagged NO_DEP should be independent
// Second task references same key but tagged NO_DEP -- should be independent
auto args_b = single_tensor_args(0xAAAA, TensorArgType::NO_DEP);
auto b = orch.submit_next_level(0xDEAD, args_b, cfg);
EXPECT_EQ(S(b.task_slot).state.load(), TaskState::READY);
Expand Down Expand Up @@ -215,7 +215,7 @@ TEST_F(OrchestratorFixture, OutputAutoAllocsFromHeapRing) {

TEST_F(OrchestratorFixture, InoutWiresCreatorAsFanin) {
// INOUT is the only tag that pulls in the prior writer as a fanin
// producer matching L2's pto_orchestrator.cpp Step B where only
// producer -- matching L2's pto_orchestrator.cpp Step B where only
// INPUT / INOUT do tensor_map.lookup. Users who want a WaW dep on
// the alloc-slot (so its HeapRing slab stays live while they write)
// must tag the buffer INOUT.
Expand Down Expand Up @@ -250,7 +250,7 @@ TEST_F(OrchestratorFixture, InoutWiresCreatorAsFanin) {

TEST_F(OrchestratorFixture, OutputAndOutputExistingAreInsertOnly) {
// Contrast with INOUT: plain OUTPUT and OUTPUT_EXISTING are pure
// overwrites insert into TensorMap, no lookup, so no fanin wire
// overwrites -- insert into TensorMap, no lookup, so no fanin wire
// on the prior writer. Matches L2 semantics for both tags. Users
// who need creator lifetime must tag the buffer INOUT.
struct Case {
Expand Down
Loading
Loading