Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ jobs:
source .venv/bin/activate
source ${ASCEND_HOME_PATH}/bin/setenv.bash
set -e
python -m pytest tests -m requires_hardware --platform a2a3 -v
python -m pytest tests -m requires_hardware --platform a2a3 --device ${DEVICE_RANGE} -v

- name: Build and run C++ hardware unit tests
run: |
Expand Down
25 changes: 24 additions & 1 deletion python/bindings/task_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -618,7 +618,30 @@ NB_MODULE(_task_interface, m) {
.def("malloc", &ChipWorker::malloc, nb::arg("size"))
.def("free", &ChipWorker::free, nb::arg("ptr"))
.def("copy_to", &ChipWorker::copy_to, nb::arg("dst"), nb::arg("src"), nb::arg("size"))
.def("copy_from", &ChipWorker::copy_from, nb::arg("dst"), nb::arg("src"), nb::arg("size"));
.def("copy_from", &ChipWorker::copy_from, nb::arg("dst"), nb::arg("src"), nb::arg("size"))
.def(
"comm_init", &ChipWorker::comm_init, nb::arg("rank"), nb::arg("nranks"), nb::arg("rootinfo_path"),
"Initialize a communicator for this rank. ChipWorker owns ACL + stream "
"lifetime internally (onboard drives ensure_acl_ready + aclrtCreateStream; "
"sim ignores both). Pair with comm_destroy for cleanup."
)
.def(
"comm_alloc_windows", &ChipWorker::comm_alloc_windows, nb::arg("comm_handle"), nb::arg("win_size"),
"Allocate per-rank windows and return the device CommContext pointer."
)
.def(
"comm_get_local_window_base", &ChipWorker::comm_get_local_window_base, nb::arg("comm_handle"),
"Return this rank's local window base address."
)
.def(
"comm_get_window_size", &ChipWorker::comm_get_window_size, nb::arg("comm_handle"),
"Return the actual per-rank window size (may differ from the hint)."
)
.def("comm_barrier", &ChipWorker::comm_barrier, nb::arg("comm_handle"), "Synchronize all ranks.")
.def(
"comm_destroy", &ChipWorker::comm_destroy, nb::arg("comm_handle"),
"Destroy the communicator and release its resources."
);

// --- Standalone blob helpers ---
m.def(
Expand Down
38 changes: 38 additions & 0 deletions python/simpler/task_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,44 @@ def copy_from(self, dst, src, size):
"""Copy *size* bytes from worker *src* to host *dst*."""
self._impl.copy_from(int(dst), int(src), int(size))

def comm_init(self, rank: int, nranks: int, rootinfo_path: str) -> int:
    """Set up the distributed communicator for this rank and return its handle.

    ACL bring-up and the aclrtStream are owned by ChipWorker internally,
    so callers never deal with ``aclInit`` / ``aclrtSetDevice`` / stream
    lifetimes themselves. On sim, ACL / stream are not used. Every
    successful call should be matched by ``comm_destroy`` at teardown.

    Args:
        rank: 0-based rank of this process.
        nranks: Total number of participating ranks.
        rootinfo_path: Filesystem path used for the rank handshake.

    Returns:
        Opaque communicator handle (uint64) to pass to the other
        ``comm_*`` calls.
    """
    init_fn = self._impl.comm_init
    # Arguments are normalized to plain int / str before crossing into the binding.
    handle = init_fn(int(rank), int(nranks), str(rootinfo_path))
    return int(handle)

def comm_alloc_windows(self, comm_handle: int, win_size: int) -> int:
    """Allocate the per-rank windows for *comm_handle*.

    Returns:
        Device CommContext pointer as an integer (uint64).
    """
    alloc_fn = self._impl.comm_alloc_windows
    device_ctx = alloc_fn(int(comm_handle), int(win_size))
    return int(device_ctx)

def comm_get_local_window_base(self, comm_handle: int) -> int:
    """Look up the base address (uint64) of this rank's local window."""
    query_fn = self._impl.comm_get_local_window_base
    base_addr = query_fn(int(comm_handle))
    return int(base_addr)

def comm_get_window_size(self, comm_handle: int) -> int:
    """Report the actual size, in bytes, of each rank's window."""
    query_fn = self._impl.comm_get_window_size
    size_bytes = query_fn(int(comm_handle))
    return int(size_bytes)

def comm_barrier(self, comm_handle: int) -> None:
    """Block until all ranks of the communicator have synchronized."""
    barrier_fn = self._impl.comm_barrier
    barrier_fn(int(comm_handle))

def comm_destroy(self, comm_handle: int) -> None:
    """Tear down the communicator and release the resources it holds."""
    destroy_fn = self._impl.comm_destroy
    destroy_fn(int(comm_handle))

@property
def device_id(self):
return self._impl.device_id
Expand Down
2 changes: 1 addition & 1 deletion src/a2a3/platform/include/host/memory_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ class MemoryAllocator {
* @return Number of currently tracked pointers
*/
size_t get_allocation_count() const {
std::lock_guard<std::mutex> lk(mu_);
std::scoped_lock lk(mu_);
return ptr_set_.size();
}

Expand Down
4 changes: 4 additions & 0 deletions src/a2a3/platform/onboard/host/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ target_include_directories(host_runtime
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_SOURCE_DIR}/../../include
# Shared platform_comm headers (comm.h / comm_context.h) live in
# src/common so a2a3 (HCCL) and a5/a2a3 sim (POSIX-shm) can use the
# same contract.
${CMAKE_CURRENT_SOURCE_DIR}/../../../../common
${CMAKE_CUSTOM_INCLUDE_DIRS}
PRIVATE
${ASCEND_HOME_PATH}/include
Expand Down
4 changes: 2 additions & 2 deletions src/a2a3/platform/onboard/host/comm_hccl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
* when extracting per-rank RDMA window addresses.
*/

#include "host/comm.h"
#include "common/comm_context.h"
#include "platform_comm/comm.h"
#include "platform_comm/comm_context.h"

#include <chrono>
#include <cstdio>
Expand Down
32 changes: 32 additions & 0 deletions src/a2a3/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,38 @@ int DeviceRunner::ensure_acl_ready(int device_id) {
return 0;
}

void *DeviceRunner::create_comm_stream() {
aclrtStream stream = nullptr;
aclError aRet = aclrtCreateStream(&stream);
if (aRet != ACL_SUCCESS) {
LOG_ERROR("aclrtCreateStream failed: %d", static_cast<int>(aRet));
return nullptr;
}
return stream;
}

int DeviceRunner::destroy_comm_stream(void *stream) {
if (stream == nullptr) return 0;

// Best-effort teardown. HcclBarrier submits async work on the stream;
// if the caller never blocked for completion (or hit the L1a 507018
// barrier regression), aclrtDestroyStream will refuse with 507901
// ("stream still has pending tasks"). We try to drain first, then
// destroy anyway, and log failures without propagating them — leaking
// a stream at teardown is strictly better than failing the teardown
// itself, which would block device finalization. This matches the
// cleanup behavior of the L1a C++ hardware UT.
aclError sync_rc = aclrtSynchronizeStream(static_cast<aclrtStream>(stream));
if (sync_rc != ACL_SUCCESS) {
LOG_ERROR("aclrtSynchronizeStream during stream teardown failed: %d", static_cast<int>(sync_rc));
}
aclError destroy_rc = aclrtDestroyStream(static_cast<aclrtStream>(stream));
if (destroy_rc != ACL_SUCCESS) {
LOG_ERROR("aclrtDestroyStream failed (leaking stream): %d", static_cast<int>(destroy_rc));
}
return 0;
}

int DeviceRunner::prepare_run_context(int device_id) {
int rc = attach_current_thread(device_id);
if (rc != 0) {
Expand Down
20 changes: 20 additions & 0 deletions src/a2a3/platform/onboard/host/device_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,26 @@ class DeviceRunner {
*/
int ensure_acl_ready(int device_id);

/**
 * Create a caller-owned aclrtStream for comm_* usage.
 *
 * Intended to back the ChipWorker Python wrapper's internal stream
 * ownership for distributed comm — callers pair it with
 * destroy_comm_stream() at teardown. The ACL context must already be
 * ready on the calling thread (ensure_acl_ready()).
 *
 * @return aclrtStream pointer on success, NULL on failure. On failure the
 *         aclrtCreateStream error code is logged.
 */
void *create_comm_stream();

/**
 * Destroy a stream previously returned by create_comm_stream().
 * Tolerates a nullptr stream (returns 0).
 *
 * Teardown is best-effort: the stream is synchronized and then destroyed,
 * and a failure of either step is logged rather than propagated.
 *
 * @return Always 0 in the current implementation; synchronize/destroy
 *         failures are reported through the error log only.
 */
int destroy_comm_stream(void *stream);

/**
* Ensure the current thread has fresh run-scoped streams.
*
Expand Down
24 changes: 24 additions & 0 deletions src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,30 @@ int ensure_acl_ready_ctx(DeviceContextHandle ctx, int device_id) {
}
}

/*
* Stream creation/destruction exposed so the ChipWorker Python wrapper can
* drive comm_init end-to-end without leaking aclrtStream lifetime (or ACL
* libs) into Python. Both entries go through the DeviceRunner so the ACL
* ready-flag and device bookkeeping stay consistent with the normal run path.
*/
/*
 * C entry point: forward to DeviceRunner::create_comm_stream() on the runner
 * behind `ctx`. Returns NULL for a NULL ctx or when the call throws.
 */
void *create_comm_stream_ctx(DeviceContextHandle ctx) {
    void *stream = NULL;
    if (ctx != NULL) {
        try {
            DeviceRunner *runner = static_cast<DeviceRunner *>(ctx);
            stream = runner->create_comm_stream();
        } catch (...) {
            // Exceptions must not cross the C ABI; report failure as NULL.
            stream = NULL;
        }
    }
    return stream;
}

/*
 * C entry point: forward to DeviceRunner::destroy_comm_stream() on the runner
 * behind `ctx`. Returns -1 for a NULL ctx or when the call throws; otherwise
 * returns whatever the runner returns.
 */
int destroy_comm_stream_ctx(DeviceContextHandle ctx, void *stream) {
    int rc = -1;
    if (ctx != NULL) {
        try {
            DeviceRunner *runner = static_cast<DeviceRunner *>(ctx);
            rc = runner->destroy_comm_stream(stream);
        } catch (...) {
            // Exceptions must not cross the C ABI; report failure as -1.
            rc = -1;
        }
    }
    return rc;
}

void *device_malloc_ctx(DeviceContextHandle ctx, size_t size) {
if (ctx == NULL) return NULL;
try {
Expand Down
9 changes: 9 additions & 0 deletions src/a2a3/platform/sim/host/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ list(APPEND HOST_RUNTIME_SOURCES
"${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/performance_collector.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../aicpu/platform_aicpu_affinity.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/platform_comm/comm_sim.cpp"
)

if(DEFINED CUSTOM_SOURCE_DIRS)
Expand Down Expand Up @@ -81,6 +82,9 @@ target_include_directories(host_runtime
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_SOURCE_DIR}/../../include
# Shared platform_comm headers so comm_sim.cpp (in src/common) resolves
# its #include "platform_comm/comm.h" / "platform_comm/comm_context.h".
${CMAKE_CURRENT_SOURCE_DIR}/../../../../common
${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/sim_context
${CMAKE_CUSTOM_INCLUDE_DIRS}
)
Expand All @@ -92,6 +96,11 @@ target_link_libraries(host_runtime
dl
)

# POSIX shm_open / shm_unlink live in libSystem on macOS but require -lrt on Linux.
# Therefore librt is linked (PRIVATE) into host_runtime only on non-Apple UNIX hosts.
if(UNIX AND NOT APPLE)
target_link_libraries(host_runtime PRIVATE rt)
endif()

# Allow undefined symbols from libcpu_sim_context.so (loaded with RTLD_GLOBAL at runtime).
# On macOS, the linker requires -undefined dynamic_lookup; on Linux/gcc this is the default.
if(APPLE)
Expand Down
24 changes: 24 additions & 0 deletions src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,30 @@ int finalize_device(DeviceContextHandle ctx) {
}
}

/* ===========================================================================
* ACL lifecycle stubs. Sim has no ACL / aclrtStream concept, so these
* no-op to satisfy the uniform host_runtime.so ABI (ChipWorker dlsym's the
* full extension surface unconditionally). The paired comm_init / barrier /
* destroy entry points already live in comm_sim.cpp.
* =========================================================================== */

/* Sim stub: both arguments are ignored and success (0) is reported. */
int ensure_acl_ready_ctx(DeviceContextHandle ctx, int device_id) {
    static_cast<void>(ctx);
    static_cast<void>(device_id);
    const int ok = 0;
    return ok;
}

/* Sim stub: no stream is created; the context is ignored and NULL is returned. */
void *create_comm_stream_ctx(DeviceContextHandle ctx) {
    static_cast<void>(ctx);
    void *const no_stream = NULL;
    return no_stream;
}

/* Sim stub: nothing to destroy; both arguments are ignored and 0 is returned. */
int destroy_comm_stream_ctx(DeviceContextHandle ctx, void *stream) {
    static_cast<void>(ctx);
    static_cast<void>(stream);
    const int ok = 0;
    return ok;
}

/* ===========================================================================
* Internal helpers called from runtime_maker.cpp via Runtime.host_api
* =========================================================================== */
Expand Down
2 changes: 1 addition & 1 deletion src/a5/platform/include/host/memory_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ class MemoryAllocator {
* @return Number of currently tracked pointers
*/
size_t get_allocation_count() const {
std::lock_guard<std::mutex> lk(mu_);
std::scoped_lock lk(mu_);
return ptr_set_.size();
}

Expand Down
65 changes: 65 additions & 0 deletions src/a5/platform/onboard/host/pto_runtime_c_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,71 @@ int finalize_device(DeviceContextHandle ctx) {
}
}

/* ===========================================================================
* ACL + comm_* placeholders (distributed runtime not yet implemented on a5)
*
* These exist only to satisfy ChipWorker's unconditional dlsym of the extension
* surface — the contract is "every host_runtime.so exports the full set; a
* runtime without a real implementation returns a not-supported result at
* call time" rather than having ChipWorker probe each symbol individually.
* When a5 grows real HCCL / sim distributed support these stubs get replaced
* wholesale; no ChipWorker changes are needed.
* =========================================================================== */

/* a5 placeholder: both arguments are ignored and success (0) is reported. */
int ensure_acl_ready_ctx(DeviceContextHandle ctx, int device_id) {
    static_cast<void>(ctx);
    static_cast<void>(device_id);
    const int ok = 0;
    return ok;
}

/* a5 placeholder: no stream is created; the context is ignored and NULL is returned. */
void *create_comm_stream_ctx(DeviceContextHandle ctx) {
    static_cast<void>(ctx);
    void *const no_stream = NULL;
    return no_stream;
}

/* a5 placeholder: nothing to destroy; both arguments are ignored and 0 is returned. */
int destroy_comm_stream_ctx(DeviceContextHandle ctx, void *stream) {
    static_cast<void>(ctx);
    static_cast<void>(stream);
    const int ok = 0;
    return ok;
}

/*
 * a5 placeholder for communicator creation. All arguments are ignored and
 * NULL is returned: the distributed runtime is not yet supported on a5.
 */
void *comm_init(int rank, int nranks, void *stream, const char *rootinfo_path) {
    static_cast<void>(rank);
    static_cast<void>(nranks);
    static_cast<void>(stream);
    static_cast<void>(rootinfo_path);
    void *const unsupported = NULL;  // distributed runtime not yet supported on a5
    return unsupported;
}

/*
 * a5 placeholder for window allocation. All arguments are ignored
 * (device_ctx_out is not written) and -1 is returned.
 */
int comm_alloc_windows(void *handle, size_t win_size, uint64_t *device_ctx_out) {
    static_cast<void>(handle);
    static_cast<void>(win_size);
    static_cast<void>(device_ctx_out);
    const int not_supported = -1;
    return not_supported;
}

/*
 * a5 placeholder for the local window base query. Both arguments are
 * ignored (base_out is not written) and -1 is returned.
 */
int comm_get_local_window_base(void *handle, uint64_t *base_out) {
    static_cast<void>(handle);
    static_cast<void>(base_out);
    const int not_supported = -1;
    return not_supported;
}

/*
 * a5 placeholder for the window size query. Both arguments are ignored
 * (size_out is not written) and -1 is returned.
 */
int comm_get_window_size(void *handle, size_t *size_out) {
    static_cast<void>(handle);
    static_cast<void>(size_out);
    const int not_supported = -1;
    return not_supported;
}

/* a5 placeholder for the rank barrier. The handle is ignored and -1 is returned. */
int comm_barrier(void *handle) {
    static_cast<void>(handle);
    const int not_supported = -1;
    return not_supported;
}

/* a5 placeholder for communicator teardown. The handle is ignored and -1 is returned. */
int comm_destroy(void *handle) {
    static_cast<void>(handle);
    const int not_supported = -1;
    return not_supported;
}

/* ===========================================================================
* Internal helpers called from runtime_maker.cpp via Runtime.host_api
* =========================================================================== */
Expand Down
10 changes: 10 additions & 0 deletions src/a5/platform/sim/host/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ list(APPEND HOST_RUNTIME_SOURCES
"${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/performance_collector.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../aicpu/platform_aicpu_affinity.cpp"
# Shared POSIX-shm sim comm backend (same source as a2a3 sim).
"${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/platform_comm/comm_sim.cpp"
)

if(DEFINED CUSTOM_SOURCE_DIRS)
Expand Down Expand Up @@ -81,6 +83,9 @@ target_include_directories(host_runtime
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_SOURCE_DIR}/../../include
# Shared platform_comm headers so the common comm_sim.cpp resolves
# its #include "platform_comm/comm.h" / "platform_comm/comm_context.h".
${CMAKE_CURRENT_SOURCE_DIR}/../../../../common
${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/sim_context
${CMAKE_CUSTOM_INCLUDE_DIRS}
)
Expand All @@ -92,6 +97,11 @@ target_link_libraries(host_runtime
dl
)

# POSIX shm_open / shm_unlink live in libSystem on macOS but require -lrt on Linux.
# Therefore librt is linked (PRIVATE) into host_runtime only on non-Apple UNIX hosts.
if(UNIX AND NOT APPLE)
target_link_libraries(host_runtime PRIVATE rt)
endif()

# Allow undefined symbols from libcpu_sim_context.so (loaded with RTLD_GLOBAL at runtime).
# On macOS, the linker requires -undefined dynamic_lookup; on Linux/gcc this is the default.
if(APPLE)
Expand Down
Loading
Loading